textutils 0.6.7 → 0.6.8

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 526367f0ef97dce6925616ab11b108b1fcddd0d9
4
- data.tar.gz: ea3a6c42310cf7ab0a45e1c71cfcd3dc8d5c2d05
3
+ metadata.gz: c59b2415860e0f99e9f0a71f8a9ee90f53d62e50
4
+ data.tar.gz: 4bf60477e5056ecfa8c7c20c8fe9d5a93a0d93e1
5
5
  SHA512:
6
- metadata.gz: 5e1ab42c2b9222f2b35513d73221b55c491ef12facac4503b127430f0d086a0e3c25cf57c5f53394fe30e9bb6af8d392c8639e2a5457068810cb2e92ddfc7b31
7
- data.tar.gz: bfb33fd98637d1ed90ed82e06fe5c5727c28b781c934de6cea82772a9b7789fe7c70524b6e468fc121ccedc1a96f93a1958cf5951c885cf122d3fcb1b3118185
6
+ metadata.gz: 1703cfd9dd7e0cef58d5e29e718ae2606dae35810b86687f56dc160f3903eb4f7bd6327a42ea51f91f0de51d38f5df062ad9002734c06798d16f4d24f963c748
7
+ data.tar.gz: 35d720818928c515babb1d3f9ea3a4c34bc793e5de1889b39d8ddad52ddcb3db7bf0b7bc9f19d4413fa457b10e4bc80bbc53097aeed2d755e7815e6b409bfe20
data/Manifest.txt CHANGED
@@ -3,6 +3,7 @@ Manifest.txt
3
3
  README.markdown
4
4
  Rakefile
5
5
  lib/textutils.rb
6
+ lib/textutils/classifier.rb
6
7
  lib/textutils/filter/code_filter.rb
7
8
  lib/textutils/filter/comment_filter.rb
8
9
  lib/textutils/filter/erb_django_filter.rb
data/lib/textutils.rb CHANGED
@@ -34,3 +34,5 @@ require 'textutils/reader/line_reader'
34
34
  require 'textutils/reader/values_reader'
35
35
  require 'textutils/reader/fixture_reader'
36
36
 
37
+ require 'textutils/classifier'
38
+
@@ -0,0 +1,146 @@
1
+ # encoding: utf-8
2
+
3
module TextUtils

  # Naive keyword-counting text classifier.
  #
  # Every key (for example a language code such as 'en' or 'de', or a topic
  # name) is trained with a list of words. #classify counts how often the
  # words of each key occur in a text (plain substring search, no word
  # boundaries) and returns the key with the highest total.
  #
  # Usage:
  #   classifier = TextUtils::Classifier.new
  #   classifier.train( 'en', 'the|and|of' )
  #   classifier.train( 'de', ['der', 'und'] )
  #   classifier.classify( 'the cat and the hat' )   # => 'en'
  class Classifier

    include LogUtils::Logging

    # Lines starting (after optional whitespace) with one of these markers
    # are treated as whole-line comments:
    #   1) #####            (shell/ruby style)
    #   2) -- comment here  (haskell style)
    #   3) % comment here   (tex/latex style)
    COMMENT_LINE_REGEX = /\A\s*(?:#|--|%)/

    # Trailing end-of-line comment, e.g.
    #   nyc, New York   # Sample EOL Comment Here (with or without commas,,,,)
    # becomes
    #   nyc, New York
    EOL_COMMENT_REGEX = /\s+#.+$/

    def initialize
      # key => word list (array of strings), kept in training order.
      # A plain hash is used on purpose: Hash.new( [] ) would hand out one
      # shared default array to every missing key.
      @h = {}
    end

    # Adds words to a lang/topic key. May be called several times for the
    # same key; the word lists are appended.
    #
    # key                - any object usable as a hash key (e.g. 'en')
    # ary_or_hash_or_str - one of:
    #                        Array  - list of words, used as is
    #                        Hash   - every value is a '|'-separated word list
    #                                 (the hash keys are ignored)
    #                        String - a '|'-separated word list
    #
    # Empty words (as produced by 'a||b' or '|a') are dropped; they can never
    # match anything meaningful.
    #
    # Returns the complete word list now stored for key.
    def train( key, ary_or_hash_or_str )
      words = case ary_or_hash_or_str
              when Array
                ary_or_hash_or_str
              when Hash
                ary_or_hash_or_str.values.flat_map { |values| split_words( values ) }
              else # assume string (allow list separated by |)
                split_words( ary_or_hash_or_str )
              end

      words = words.reject { |word| word.to_s.empty? }

      # nb: always builds a new array - the caller's array is never aliased
      @h[ key ] = (@h[ key ] || []) + words
    end

    # Reads the file at path (via File.read_utf8) and classifies its content.
    # See #classify for the return value.
    def classify_file( path )
      classify( File.read_utf8( path ) )
    end

    # Classifies a text.
    #
    # Comment lines and trailing "# ..." comments are stripped first (see
    # #strip_comments); then the occurrences of every trained word are
    # counted per key.
    #
    # Returns the key with the highest word count. On a tie (including the
    # case that no word matched at all) the key trained first wins.
    # Returns nil if no key has been trained yet.
    def classify( text_with_comments )

      ## check encoding
      logger.debug " classify - text.encoding: #{text_with_comments.encoding.name}"

      if @h.empty?
        logger.debug "classifier - no keys trained; nothing to classify"
        return nil
      end

      # nb: strip comments first
      text = strip_comments( text_with_comments )

      ## e.g. [[ 'en', 20],   # 20 words
      ##       [ 'de',  2]]   #  2 words
      counts = @h.each_with_index.map do |(key, words), i|
        logger.debug "key #{key} (#{i+1}/#{@h.size}) - #{words.size} words"
        [key, count_words_in_text( words, text )]
      end

      # sort by word count (highest count goes first); the original position
      # is the secondary sort key so that ties are resolved deterministically
      counts = counts.each_with_index
                     .sort_by { |(_key, count), i| [-count, i] }
                     .map { |entry, _i| entry }

      # dump stats
      logger.debug "results:"
      counts.each_with_index do |entry, i|
        ## e.g. 1. en: 20 words
        ##      2. de: 2 words
        logger.debug " #{i+1}. #{entry[0]}: #{entry[1]}"
      end

      best_key = counts[0][0]
      logger.debug "classifier - using key >>#{best_key}<<"

      ## return key/lang code w/ highest count
      best_key
    end


    # For debugging: logs the setup, that is, every key with its words and
    # the encoding(s) of the words.
    def dump
      @h.each_with_index do |(key, words), i|
        logger.debug "key #{key} (#{i+1}/#{@h.size}) - #{words.size} words:"
        logger.debug words.inspect

        ## check encoding of words (trouble w/ windows cp850)
        ## nb: only logs when the encoding changes from one word to the next
        last_encoding_name = ''
        words.each do |word|
          if last_encoding_name != word.encoding.name
            logger.debug " encoding: #{word.encoding.name}"
            last_encoding_name = word.encoding.name
          end
        end
      end
    end


    private

    # Splits a '|'-separated word list; leading/trailing whitespace of the
    # whole list is removed first.
    def split_words( str )
      str.strip.split( '|' )
    end

    # Returns a copy of text without comments:
    # - whole-line comments (see COMMENT_LINE_REGEX) are dropped entirely
    # - a trailing "  # ..." comment is cut off (see EOL_COMMENT_REGEX)
    #
    # Every kept line ends with exactly one "\n" in the result.
    def strip_comments( text )
      new_text = ''

      text.each_line do |line|
        if line =~ COMMENT_LINE_REGEX
          # skip comments and do NOT copy to result (keep comments secret!)
          logger.debug 'skipping comment line'
          next
        end

        # nb: each_line keeps the line terminator; remove it before
        #  re-adding "\n" - otherwise every line break gets doubled
        new_text << line.chomp.sub( EOL_COMMENT_REGEX, '' )
        new_text << "\n"
      end

      new_text
    end


    # Counts the non-overlapping occurrences of word in text
    # (plain substring search).
    #
    # Returns 0 for a nil or empty word: String#index finds the empty string
    # at every position without ever advancing, so searching for it would
    # loop forever.
    def count_word_in_text( word, text )
      return 0 if word.nil? || word.empty?

      count = 0
      pos = text.index( word )
      while pos
        count += 1
        logger.debug "bingo - found >>#{word}<< on pos #{pos}, count: #{count}"
        # nb: pos and word.length are both in characters; pos+word.length is
        #  the first character after the match - no +1 needed
        pos = text.index( word, pos + word.length )
      end
      count
    end

    # Sums up the occurrences of all words in text.
    def count_words_in_text( words, text )
      words.inject( 0 ) { |sum, word| sum + count_word_in_text( word, text ) }
    end


  end # class Classifier

end # module TextUtils
@@ -1,6 +1,6 @@
1
1
 
2
2
  module TextUtils
3
3
 
4
- VERSION = '0.6.7'
4
+ VERSION = '0.6.8'
5
5
 
6
6
  end # module TextUtils
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: textutils
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.7
4
+ version: 0.6.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gerald Bauer
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-08-19 00:00:00.000000000 Z
11
+ date: 2013-08-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: logutils
@@ -64,6 +64,7 @@ files:
64
64
  - README.markdown
65
65
  - Rakefile
66
66
  - lib/textutils.rb
67
+ - lib/textutils/classifier.rb
67
68
  - lib/textutils/filter/code_filter.rb
68
69
  - lib/textutils/filter/comment_filter.rb
69
70
  - lib/textutils/filter/erb_django_filter.rb