RubyGems - textutils - Versions diffs - 0.6.7 → 0.6.8 - Mend

textutils 0.6.7 → 0.6.8

Files changed (6) hide show

checksums.yaml +4 -4
data/Manifest.txt +1 -0
data/lib/textutils.rb +2 -0
data/lib/textutils/classifier.rb +146 -0
data/lib/textutils/version.rb +1 -1
metadata +3 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 526367f0ef97dce6925616ab11b108b1fcddd0d9
-  data.tar.gz: ea3a6c42310cf7ab0a45e1c71cfcd3dc8d5c2d05
+  metadata.gz: c59b2415860e0f99e9f0a71f8a9ee90f53d62e50
+  data.tar.gz: 4bf60477e5056ecfa8c7c20c8fe9d5a93a0d93e1
 SHA512:
-  metadata.gz: 5e1ab42c2b9222f2b35513d73221b55c491ef12facac4503b127430f0d086a0e3c25cf57c5f53394fe30e9bb6af8d392c8639e2a5457068810cb2e92ddfc7b31
-  data.tar.gz: bfb33fd98637d1ed90ed82e06fe5c5727c28b781c934de6cea82772a9b7789fe7c70524b6e468fc121ccedc1a96f93a1958cf5951c885cf122d3fcb1b3118185
+  metadata.gz: 1703cfd9dd7e0cef58d5e29e718ae2606dae35810b86687f56dc160f3903eb4f7bd6327a42ea51f91f0de51d38f5df062ad9002734c06798d16f4d24f963c748
+  data.tar.gz: 35d720818928c515babb1d3f9ea3a4c34bc793e5de1889b39d8ddad52ddcb3db7bf0b7bc9f19d4413fa457b10e4bc80bbc53097aeed2d755e7815e6b409bfe20

data/Manifest.txt CHANGED Viewed

@@ -3,6 +3,7 @@ Manifest.txt
 README.markdown
 Rakefile
 lib/textutils.rb
+lib/textutils/classifier.rb
 lib/textutils/filter/code_filter.rb
 lib/textutils/filter/comment_filter.rb
 lib/textutils/filter/erb_django_filter.rb

data/lib/textutils.rb CHANGED Viewed

@@ -34,3 +34,5 @@ require 'textutils/reader/line_reader'
 require 'textutils/reader/values_reader'
 require 'textutils/reader/fixture_reader'
+require 'textutils/classifier'

data/lib/textutils/classifier.rb ADDED Viewed

@@ -0,0 +1,146 @@
+# encoding: utf-8
+module TextUtils
+# Naive keyword-count classifier.
+#
+# Every key (e.g. a language or topic code) is trained with a list of words;
+# #classify returns the key whose words occur most often in a given text.
+class Classifier
+  include LogUtils::Logging
+  def initialize
+    # key => word list; the block hands every key its own array, so no
+    # default object is ever shared between keys
+    @h = Hash.new { |hash, key| hash[key] = [] }
+  end
+  # Adds words to the word list of +key+.
+  #
+  # +ary_or_hash_or_str+ may be
+  # * an Array of words (used as is),
+  # * a Hash whose values are '|'-separated word lists (the hash keys are
+  #   ignored), or
+  # * anything else, which is treated as a String holding a '|'-separated
+  #   word list.
+  #
+  # Returns the complete word list now stored for +key+.
+  def train( key, ary_or_hash_or_str )
+    words = case ary_or_hash_or_str
+            when Array
+              ary_or_hash_or_str
+            when Hash
+              ary_or_hash_or_str.values.flat_map { |values| values.strip.split('|') }
+            else  # assume string (allow list separated by |)
+              ary_or_hash_or_str.strip.split('|')
+            end
+    # concat copies the words into our own array (the caller's array is
+    # never aliased or modified)
+    @h[ key ].concat( words )
+  end
+  # Classifies the contents of the file at +path+; see #classify.
+  # NOTE(review): File.read_utf8 is not part of Ruby's standard library -
+  # presumably a core extension defined elsewhere in textutils; confirm.
+  def classify_file( path )
+    classify( File.read_utf8( path ) )
+  end
+  # Returns the key whose trained words occur most often in
+  # +text_with_comments+ (comments are stripped before counting).
+  #
+  # On a tie the key that was trained first wins. Returns nil if no key has
+  # been trained yet.
+  def classify( text_with_comments )
+    ## check encoding
+    logger.debug "  classify - text.encoding: #{text_with_comments.encoding.name}"
+    # nb: strip comments first
+    text = strip_comments( text_with_comments )
+    counts = []
+      ## e.g. [[ 'en', 20], # 20 words
+      ##       [ 'de',  2]] # 2 words
+    @h.each_with_index do |(key,words),i|
+      logger.debug "key #{key} (#{i+1}/#{@h.size}) - #{words.size} words"
+      counts << [key, count_words_in_text( words, text )]
+    end
+    # nothing trained => nothing to pick (counts[0][0] would raise on nil)
+    if counts.empty?
+      logger.debug "classifier - no keys trained; returning nil"
+      return nil
+    end
+    # highest count goes first; the index is the tie-breaker, so equal
+    # counts keep their training order instead of depending on the
+    # (unstable) sort implementation
+    counts = counts.each_with_index.sort_by { |(_, count), i| [-count, i] }.map(&:first)
+    # dump stats
+    logger.debug "results:"
+    counts.each_with_index do |entry,i|
+      ## e.g. 1. en: 20 words
+      ##      2. de: 2 words
+      logger.debug " #{i+1}. #{entry[0]}: #{entry[1]}"
+    end
+    best_key = counts.first.first
+    logger.debug "classifier - using key >>#{best_key}<<"
+    ## return key/lang code w/ highest count
+    best_key
+  end
+  # Logs (debug level) every key with its word list; for debugging the setup.
+  def dump
+    @h.each_with_index do |(key, words), i|
+      logger.debug "key #{key} (#{i+1}/#{@h.size}) - #{words.size} words:"
+      logger.debug words.inspect
+      ## check encoding of words (trouble w/ windows cp850 argh!!!)
+      ## - the encoding name is logged only when it differs from the
+      ##   previous word's encoding
+      last_encoding_name = ''
+      words.each do |word|
+        if last_encoding_name != word.encoding.name
+          logger.debug "  encoding: #{word.encoding.name}"
+          last_encoding_name = word.encoding.name
+        end
+      end
+    end
+  end
+private
+  # Returns a copy of +text+ without comments.
+  #
+  # Whole lines are dropped if their first non-blank characters are
+  # 1) #   (shell/ruby style)
+  # 2) --  (haskell style)
+  # 3) %   (tex/latex style)
+  #
+  # On the remaining lines a trailing "  # ..." end-of-line comment is cut off
+  #   e.g.    -> nyc, New York   # Sample EOL Comment Here (with or without commas,,,,)
+  #   becomes -> nyc, New York
+  #
+  # Every kept line ends with exactly one "\n".
+  def strip_comments( text )
+    new_text = ''
+    text.each_line do |line|
+      if line =~ /^\s*(#|--|%)/
+        # skip comments and do NOT copy to result (keep comments secret!)
+        logger.debug 'skipping comment line'
+        next
+      end
+      # each_line keeps the line terminator; chomp it so the "\n" appended
+      # below does not double every line break
+      new_text << line.chomp.sub( /\s+#.+$/, '' )
+      new_text << "\n"
+    end
+    new_text
+  end
+  # Counts the non-overlapping occurrences of +word+ in +text+.
+  # A nil or empty word counts as zero occurrences.
+  def count_word_in_text( word, text )
+    # an empty word matches at every position without ever advancing the
+    # search (pos + 0), which would loop forever; such words do show up,
+    # e.g. 'a||b'.split('|') => ['a', '', 'b']
+    return 0 if word.nil? || word.empty?
+    count = 0
+    pos = text.index( word )
+    while pos
+      count += 1
+      logger.debug "bingo - found >>#{word}<< on pos #{pos}, count: #{count}"
+      # resume right behind this hit (no +1 needed: index takes the first
+      # position to look at), so occurrences never overlap
+      pos = text.index( word, pos + word.length )
+    end
+    count
+  end
+  # Sums up the occurrences of every word of +words+ in +text+.
+  def count_words_in_text( words, text )
+    words.inject( 0 ) { |sum, word| sum + count_word_in_text( word, text ) }
+  end
+end # class Classifier
+end # module TextUtils

data/lib/textutils/version.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 module TextUtils
-  VERSION = '0.6.7'
+  VERSION = '0.6.8'
 end   # module TextUtils

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: textutils
 version: !ruby/object:Gem::Version
-  version: 0.6.7
+  version: 0.6.8
 platform: ruby
 authors:
 - Gerald Bauer
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-08-19 00:00:00.000000000 Z
+date: 2013-08-29 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: logutils
@@ -64,6 +64,7 @@ files:
 - README.markdown
 - Rakefile
 - lib/textutils.rb
+- lib/textutils/classifier.rb
 - lib/textutils/filter/code_filter.rb
 - lib/textutils/filter/comment_filter.rb
 - lib/textutils/filter/erb_django_filter.rb