RubyGems - textutils - Versions diffs - 0.6.7 → 0.6.8 - Mend

textutils 0.6.7 → 0.6.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

checksums.yaml +4 -4
data/Manifest.txt +1 -0
data/lib/textutils.rb +2 -0
data/lib/textutils/classifier.rb +146 -0
data/lib/textutils/version.rb +1 -1
metadata +3 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 526367f0ef97dce6925616ab11b108b1fcddd0d9
-  data.tar.gz: ea3a6c42310cf7ab0a45e1c71cfcd3dc8d5c2d05
+  metadata.gz: c59b2415860e0f99e9f0a71f8a9ee90f53d62e50
+  data.tar.gz: 4bf60477e5056ecfa8c7c20c8fe9d5a93a0d93e1
 SHA512:
-  metadata.gz: 5e1ab42c2b9222f2b35513d73221b55c491ef12facac4503b127430f0d086a0e3c25cf57c5f53394fe30e9bb6af8d392c8639e2a5457068810cb2e92ddfc7b31
-  data.tar.gz: bfb33fd98637d1ed90ed82e06fe5c5727c28b781c934de6cea82772a9b7789fe7c70524b6e468fc121ccedc1a96f93a1958cf5951c885cf122d3fcb1b3118185
+  metadata.gz: 1703cfd9dd7e0cef58d5e29e718ae2606dae35810b86687f56dc160f3903eb4f7bd6327a42ea51f91f0de51d38f5df062ad9002734c06798d16f4d24f963c748
+  data.tar.gz: 35d720818928c515babb1d3f9ea3a4c34bc793e5de1889b39d8ddad52ddcb3db7bf0b7bc9f19d4413fa457b10e4bc80bbc53097aeed2d755e7815e6b409bfe20

data/Manifest.txt CHANGED Viewed

@@ -3,6 +3,7 @@ Manifest.txt
 README.markdown
 Rakefile
 lib/textutils.rb
+lib/textutils/classifier.rb
 lib/textutils/filter/code_filter.rb
 lib/textutils/filter/comment_filter.rb
 lib/textutils/filter/erb_django_filter.rb

data/lib/textutils.rb CHANGED Viewed

@@ -34,3 +34,5 @@ require 'textutils/reader/line_reader'
 require 'textutils/reader/values_reader'
 require 'textutils/reader/fixture_reader'
+require 'textutils/classifier'

data/lib/textutils/classifier.rb ADDED Viewed

@@ -0,0 +1,146 @@
+# encoding: utf-8
+module TextUtils
+class Classifier
+  include LogUtils::Logging
+  def initialize
+    @h = Hash.new( [] )  # hash w/ words - default value is empty ary (word_list)
+  end
+  def train( key, ary_or_hash_or_str )
+    ## add words to lang/topic key
+    if ary_or_hash_or_str.kind_of?( Array )
+      words = ary_or_hash_or_str
+    elsif ary_or_hash_or_str.kind_of?( Hash )
+      words = []
+      ary_or_hash_or_str.each do |_, values|
+        words += values.strip.split('|')
+      end
+    else  # assume string (allow list separated by |)
+      words = ary_or_hash_or_str.strip.split('|')
+    end
+    @h[ key ] += words
+  end
+  def classify_file( path )
+    classify( File.read_utf8( path ) )
+  end
+  def classify( text_with_comments )
+    ## check encoding
+    logger.debug "  classify - text.encoding: #{text_with_comments.encoding.name}"
+    # nb: strip comments first
+    text = strip_comments( text_with_comments )
+    counts = []
+      ## e.g. [[ 'en', 20], # 20 words
+      ##       [ 'de',  2]] # 2 words
+    @h.each_with_index do |(key,words),i|
+      logger.debug "key #{key} (#{i+1}/#{@h.size}) - #{words.size} words"
+      counts << [key, count_words_in_text( words, text )]
+    end
+    # sort by word count (reverse sort e.g. highest count goes first)
+    counts = counts.sort {|l,r| r[1] <=> l[1] }
+    # dump stats
+    logger.debug "results:"
+    counts.each_with_index do |entry,i|
+      ## e.g. 1. en: 20 words
+      ##      2. de: 2 words
+      logger.debug " #{i+1}. #{entry[0]}: #{entry[1]}"
+    end
+    logger.debug "classifier - using key >>#{counts[0][0]}<<"
+    ## return key/lang code w/ highest count
+    counts[0][0]
+  end
+  def dump
+    # for debugging dump setup (that is, keys w/ words etc.)
+    @h.each_with_index do |(key, words), i|
+      logger.debug "key #{key} (#{i+1}/#{@h.size}) - #{words.size} words:"
+      logger.debug words.inspect
+      ## check encoding of words (trouble w/ windows cp850 argh!!!)
+      last_encoding_name = ''
+      words.each do |word|
+        if last_encoding_name != word.encoding.name
+          logger.debug "  encoding: #{word.encoding.name}"
+          last_encoding_name = word.encoding.name
+        end
+      end
+    end
+  end
+private
+  def strip_comments( text )
+    new_text = ''
+    text.each_line do |line|
+      # comments allow:
+      # 1) #####  (shell/ruby style)
+      # 2) --  comment here (haskel/?? style)
+      # 3) % comment here (tex/latex style)
+      if line =~ /^\s*#/ || line =~ /^\s*--/ || line =~ /^\s*%/
+        # skip komments and do NOT copy to result (keep comments secret!)
+        logger.debug 'skipping comment line'
+        next
+      end
+      ## todo: strip inline comments  - why not?
+      # pass 1) remove possible trailing eol comment
+      ##  e.g    -> nyc, New York   # Sample EOL Comment Here (with or without commas,,,,)
+      ## becomes -> nyc, New York
+      line = line.sub( /\s+#.+$/, '' )
+      new_text << line
+      new_text << "\n"
+    end
+    new_text
+  end
+  def count_word_in_text( word, text )
+    count = 0
+    pos = text.index( word )
+    while pos.nil? == false
+      count += 1
+      logger.debug "bingo - found >>#{word}<< on pos #{pos}, count: #{count}"
+      ### todo: check if pos+word.length/size needs +1 or similar
+      pos = text.index( word, pos+word.length)
+    end
+    count
+  end
+  def count_words_in_text( words, text )
+    count = 0
+    words.each do |word|
+      count += count_word_in_text( word, text )
+    end
+    count
+  end
+end # class Classifier
+end # module TextUtils

data/lib/textutils/version.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 module TextUtils
-  VERSION = '0.6.7'
+  VERSION = '0.6.8'
 end   # module TextUtils

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: textutils
 version: !ruby/object:Gem::Version
-  version: 0.6.7
+  version: 0.6.8
 platform: ruby
 authors:
 - Gerald Bauer
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-08-19 00:00:00.000000000 Z
+date: 2013-08-29 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: logutils
@@ -64,6 +64,7 @@ files:
 - README.markdown
 - Rakefile
 - lib/textutils.rb
+- lib/textutils/classifier.rb
 - lib/textutils/filter/code_filter.rb
 - lib/textutils/filter/comment_filter.rb
 - lib/textutils/filter/erb_django_filter.rb