textutils 0.6.7 → 0.6.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 526367f0ef97dce6925616ab11b108b1fcddd0d9
4
- data.tar.gz: ea3a6c42310cf7ab0a45e1c71cfcd3dc8d5c2d05
3
+ metadata.gz: c59b2415860e0f99e9f0a71f8a9ee90f53d62e50
4
+ data.tar.gz: 4bf60477e5056ecfa8c7c20c8fe9d5a93a0d93e1
5
5
  SHA512:
6
- metadata.gz: 5e1ab42c2b9222f2b35513d73221b55c491ef12facac4503b127430f0d086a0e3c25cf57c5f53394fe30e9bb6af8d392c8639e2a5457068810cb2e92ddfc7b31
7
- data.tar.gz: bfb33fd98637d1ed90ed82e06fe5c5727c28b781c934de6cea82772a9b7789fe7c70524b6e468fc121ccedc1a96f93a1958cf5951c885cf122d3fcb1b3118185
6
+ metadata.gz: 1703cfd9dd7e0cef58d5e29e718ae2606dae35810b86687f56dc160f3903eb4f7bd6327a42ea51f91f0de51d38f5df062ad9002734c06798d16f4d24f963c748
7
+ data.tar.gz: 35d720818928c515babb1d3f9ea3a4c34bc793e5de1889b39d8ddad52ddcb3db7bf0b7bc9f19d4413fa457b10e4bc80bbc53097aeed2d755e7815e6b409bfe20
data/Manifest.txt CHANGED
@@ -3,6 +3,7 @@ Manifest.txt
3
3
  README.markdown
4
4
  Rakefile
5
5
  lib/textutils.rb
6
+ lib/textutils/classifier.rb
6
7
  lib/textutils/filter/code_filter.rb
7
8
  lib/textutils/filter/comment_filter.rb
8
9
  lib/textutils/filter/erb_django_filter.rb
data/lib/textutils.rb CHANGED
@@ -34,3 +34,5 @@ require 'textutils/reader/line_reader'
34
34
  require 'textutils/reader/values_reader'
35
35
  require 'textutils/reader/fixture_reader'
36
36
 
37
+ require 'textutils/classifier'
38
+
@@ -0,0 +1,146 @@
1
+ # encoding: utf-8
2
+
3
+ module TextUtils
4
+
5
+ class Classifier
6
+
7
+ include LogUtils::Logging
8
+
9
# Sets up an empty classifier.
#
# @h maps each key (e.g. a lang/topic code) to its list of words.
# The default value comes from a block so that looking up an unknown key
# hands back a fresh empty array every time (without storing it);
# Hash.new( [] ) would share one single array object between all unknown
# keys, so one accidental << on it would leak words into every other key.
def initialize
  @h = Hash.new { |_hash, _key| [] }
end
12
+
13
# Adds words to the word list kept under +key+ (a lang/topic code).
#
# ary_or_hash_or_str may be
# - an Array of words,
# - a Hash whose values are word lists separated by | (the hash keys are ignored), or
# - anything else, which is treated as a String holding a list separated by |.
#
# Empty entries (as produced by input such as '|a' or 'a||b') are dropped:
# an empty word matches at every position of any text, so it carries no
# signal for classification.
#
# Returns the complete word list now stored under +key+.
def train( key, ary_or_hash_or_str )
  words = case ary_or_hash_or_str
          when Array
            ary_or_hash_or_str
          when Hash
            ary_or_hash_or_str.values.flat_map { |list| list.strip.split('|') }
          else # assume string (allow list separated by |)
            ary_or_hash_or_str.strip.split('|')
          end

  # += builds a new array, so neither the caller's array nor the hash default is mutated
  @h[ key ] += words.reject { |word| word.empty? }
end
30
+
31
# Classifies the content of the file at +path+.
#
# The text is loaded with File.read_utf8 (not a core File method - defined
# outside this file; presumably returns the content as utf-8 text) and
# handed on to classify; the result of classify is returned unchanged.
def classify_file( path )
  text = File.read_utf8( path )
  classify( text )
end
34
+
35
# Picks the key (e.g. lang/topic code) whose words occur most often in the text.
#
# Comment lines are stripped first (see strip_comments), then the words of
# every key in @h are counted in the remaining text.
#
# Returns the key with the highest count, or nil if no key has been
# trained yet (there is nothing to choose from).
def classify( text_with_comments )

  ## check encoding
  logger.debug " classify - text.encoding: #{text_with_comments.encoding.name}"

  # nb: strip comments first
  text = strip_comments( text_with_comments )

  ## e.g. [[ 'en', 20],   # 20 words
  ##       [ 'de',  2]]   #  2 words
  counts = @h.each_with_index.map do |(key, words), i|
    logger.debug "key #{key} (#{i+1}/#{@h.size}) - #{words.size} words"
    [key, count_words_in_text( words, text )]
  end

  # sort by word count (reverse sort e.g. highest count goes first)
  counts = counts.sort { |l, r| r[1] <=> l[1] }

  # dump stats
  logger.debug "results:"
  counts.each_with_index do |(key, count), i|
    ## e.g. 1. en: 20 words
    ##      2. de: 2 words
    logger.debug " #{i+1}. #{key}: #{count}"
  end

  # without any trained key counts is empty and counts[0] would be nil
  return nil if counts.empty?

  best_key = counts[0][0]
  logger.debug "classifier - using key >>#{best_key}<<"

  ## return key/lang code w/ highest count
  best_key
end
69
+
70
+
71
# Debugging aid: logs the complete setup, that is, every key together with
# its word list.
#
# Per key the number of words and the inspected word list get logged,
# followed by the encoding name of the words - logged once for the first
# word and again whenever the encoding differs from the word before
# (helps to track down trouble w/ windows cp850).
def dump
  position = 0

  @h.each do |key, words|
    position += 1
    logger.debug "key #{key} (#{position}/#{@h.size}) - #{words.size} words:"
    logger.debug words.inspect

    # carry the encoding name of the previous word along; starts out as ''
    # so that the encoding of the very first word always gets reported
    words.inject( '' ) do |previous_name, word|
      current_name = word.encoding.name
      logger.debug " encoding: #{current_name}" unless previous_name == current_name
      current_name
    end
  end
end
88
+
89
+
90
+ private
91
# Returns a copy of +text+ without comments.
#
# Lines that start (after optional leading whitespace) with one of the
# following markers are dropped completely:
#   1) #   (shell/ruby style)
#   2) --  (haskell style)
#   3) %   (tex/latex style)
#
# On all other lines a trailing eol comment (whitespace followed by # and
# at least one more character) is cut off, e.g.
#   nyc, New York   # Sample EOL Comment Here (with or without commas,,,,)
# becomes
#   nyc, New York
#
# Every kept line ends with exactly one "\n" in the result.
def strip_comments( text )
  kept_lines = []

  text.each_line do |raw_line|
    if raw_line =~ /\A\s*(?:#|--|%)/
      # skip comments and do NOT copy to result (keep comments secret!)
      logger.debug 'skipping comment line'
      next
    end

    # each_line keeps the line terminator; chomp it so that appending "\n"
    # below does not leave an extra blank line after every kept line
    kept_lines << raw_line.chomp.sub( /\s+#.+$/, '' )
  end

  kept_lines.map { |line| "#{line}\n" }.join
end
121
+
122
+
123
# Counts the non-overlapping occurrences of +word+ in +text+
# (plain substring search, scanning from left to right).
#
# Returns 0 for an empty word.
def count_word_in_text( word, text )
  # an empty word is found at every position and the search would never
  # advance (pos + 0), that is, the loop below would never end
  return 0 if word.empty?

  count = 0
  pos = text.index( word )
  while pos
    count += 1
    logger.debug "bingo - found >>#{word}<< on pos #{pos}, count: #{count}"
    # continue right behind the hit; pos+word.length already is the index
    # of the first character after the match (no +1 needed)
    pos = text.index( word, pos + word.length )
  end
  count
end
134
+
135
# Sums up the hits of all +words+ in +text+
# (one count_word_in_text call per word); 0 if there are no words.
def count_words_in_text( words, text )
  words.inject( 0 ) { |total, word| total + count_word_in_text( word, text ) }
end
142
+
143
+
144
+ end # class Classifier
145
+
146
+ end # module TextUtils
@@ -1,6 +1,6 @@
1
1
 
2
2
  module TextUtils
3
3
 
4
- VERSION = '0.6.7'
4
+ VERSION = '0.6.8'
5
5
 
6
6
  end # module TextUtils
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: textutils
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.7
4
+ version: 0.6.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gerald Bauer
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-08-19 00:00:00.000000000 Z
11
+ date: 2013-08-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: logutils
@@ -64,6 +64,7 @@ files:
64
64
  - README.markdown
65
65
  - Rakefile
66
66
  - lib/textutils.rb
67
+ - lib/textutils/classifier.rb
67
68
  - lib/textutils/filter/code_filter.rb
68
69
  - lib/textutils/filter/comment_filter.rb
69
70
  - lib/textutils/filter/erb_django_filter.rb