textutils 0.6.7 → 0.6.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Manifest.txt +1 -0
- data/lib/textutils.rb +2 -0
- data/lib/textutils/classifier.rb +146 -0
- data/lib/textutils/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c59b2415860e0f99e9f0a71f8a9ee90f53d62e50
|
4
|
+
data.tar.gz: 4bf60477e5056ecfa8c7c20c8fe9d5a93a0d93e1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1703cfd9dd7e0cef58d5e29e718ae2606dae35810b86687f56dc160f3903eb4f7bd6327a42ea51f91f0de51d38f5df062ad9002734c06798d16f4d24f963c748
|
7
|
+
data.tar.gz: 35d720818928c515babb1d3f9ea3a4c34bc793e5de1889b39d8ddad52ddcb3db7bf0b7bc9f19d4413fa457b10e4bc80bbc53097aeed2d755e7815e6b409bfe20
|
data/Manifest.txt
CHANGED
data/lib/textutils.rb
CHANGED
@@ -0,0 +1,146 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module TextUtils
|
4
|
+
|
5
|
+
class Classifier
|
6
|
+
|
7
|
+
include LogUtils::Logging
|
8
|
+
|
9
|
+
# Sets up an empty classifier.
#
# @h maps a key (e.g. a language or topic code) to its list of trained words.
def initialize
  # nb: use the block form so that every lookup of an unknown key gets its
  #  own fresh array; Hash.new( [] ) hands out one and the same array object
  #  for all missing keys - any in-place change (e.g. <<) would leak into
  #  every other key. the block does NOT store the array, that is,
  #  just reading an unknown key still adds nothing to the hash.
  @h = Hash.new { |_hash, _key| [] }
end
|
12
|
+
|
13
|
+
# Adds words to the word list of a key (e.g. a language or topic code).
#
# key                - key the words get filed under
# ary_or_hash_or_str - words to add, in one of three shapes:
#                        Array  - used as the word list
#                        Hash   - keys are ignored; every value is a string
#                                 w/ words separated by |
#                        other  - assumed to be a string w/ words separated by |
#
# Returns the complete (updated) word list for the key.
def train( key, ary_or_hash_or_str )

  words = case ary_or_hash_or_str
          when Array
            ary_or_hash_or_str
          when Hash
            ary_or_hash_or_str.values.flat_map { |value| value.strip.split('|') }
          else # assume string (allow list separated by |)
            ary_or_hash_or_str.strip.split('|')
          end

  # nb: drop empty words (e.g. the middle entry in 'a||b'); an empty word
  #  carries no signal and matches at every position of a text
  words = words.reject { |word| word.empty? }

  @h[ key ] += words
end
|
30
|
+
|
31
|
+
# Reads the file at path and classifies its text.
#
# nb: File.read_utf8 is not part of core ruby - presumably an extension
#  provided elsewhere in this library (todo: confirm).
#
# Returns whatever classify returns for the text read.
def classify_file( path )
  text = File.read_utf8( path )
  classify( text )
end
|
34
|
+
|
35
|
+
# Picks the trained key whose words occur most often in the text.
#
# text_with_comments - text to classify; comment lines and trailing
#                      end-of-line comments get stripped before counting
#
# Returns the key w/ the highest word count, or nil if no key has been
# trained yet. On a tie the key that the sort puts first wins.
def classify( text_with_comments )

  ## check encoding
  logger.debug " classify - text.encoding: #{text_with_comments.encoding.name}"

  # nb: w/o any trained keys there is nothing to pick from;
  #  return nil (instead of failing on counts[0] being nil)
  if @h.empty?
    logger.debug "classifier - no keys trained; returning nil"
    return nil
  end

  # nb: strip comments first
  text = strip_comments( text_with_comments )

  ## e.g. [[ 'en', 20],   # 20 words
  ##       [ 'de',  2]]   #  2 words
  counts = @h.each_with_index.map do |(key, words), i|
    logger.debug "key #{key} (#{i+1}/#{@h.size}) - #{words.size} words"
    [key, count_words_in_text( words, text )]
  end

  # sort by word count (reverse sort e.g. highest count goes first)
  counts = counts.sort { |l, r| r[1] <=> l[1] }

  # dump stats
  logger.debug "results:"
  counts.each_with_index do |entry, i|
    ## e.g. 1. en: 20 words
    ##      2. de:  2 words
    logger.debug " #{i+1}. #{entry[0]}: #{entry[1]}"
  end

  best_key = counts[0][0]
  logger.debug "classifier - using key >>#{best_key}<<"

  ## return key/lang code w/ highest count
  best_key
end
|
69
|
+
|
70
|
+
|
71
|
+
# Logs the training setup for debugging: for every key its position,
# number of words, the words themselves and the encoding(s) of the words.
def dump
  total = @h.size

  @h.each_with_index do |(key, words), i|
    logger.debug "key #{key} (#{i+1}/#{total}) - #{words.size} words:"
    logger.debug words.inspect

    ## check encoding of words (trouble w/ windows cp850 argh!!!)
    ##  nb: log the encoding only when it differs from the one of the
    ##      word before (starts w/ '' so the first word always gets logged)
    words.inject( '' ) do |previous_name, word|
      current_name = word.encoding.name
      logger.debug " encoding: #{current_name}" unless current_name == previous_name
      current_name
    end
  end
end
|
88
|
+
|
89
|
+
|
90
|
+
private
|
91
|
+
# Returns a copy of text w/o comments.
#
# Lines that (after optional leading whitespace) start w/ a comment marker
# get dropped completely; on all other lines a trailing end-of-line
# comment gets cut off. Every kept line ends w/ exactly one "\n".
#
# comments allow:
#   1) #####           (shell/ruby style)
#   2) -- comment here (haskell/?? style)
#   3) % comment here  (tex/latex style)
def strip_comments( text )
  new_text = ''

  text.each_line do |line|

    if line =~ /\A\s*(?:#|--|%)/
      # skip comments and do NOT copy to result (keep comments secret!)
      logger.debug 'skipping comment line'
      next
    end

    # nb: each_line keeps the line terminator; remove it here so that
    #  adding "\n" below does not leave a blank line after every kept line
    line = line.chomp

    # remove possible trailing eol comment
    ##  e.g    -> nyc, New York   # Sample EOL Comment Here (with or without commas,,,,)
    ## becomes -> nyc, New York
    line = line.sub( /\s+#.+$/, '' )

    new_text << line << "\n"
  end

  new_text
end
|
121
|
+
|
122
|
+
|
123
|
+
# Counts the non-overlapping occurrences of word in text
# (plain substring match - no word boundaries get checked).
#
# Returns the number of matches; 0 for an empty word.
def count_word_in_text( word, text )
  # nb: an empty word matches at every position and never moves the
  #  search position forward - bail out instead of looping forever
  return 0 if word.empty?

  count = 0
  pos = text.index( word )
  while pos
    count += 1
    logger.debug "bingo - found >>#{word}<< on pos #{pos}, count: #{count}"
    # nb: the 2nd arg of index is the position to start searching from;
    #  pos+word.length is the first char after the match - no +1 needed
    pos = text.index( word, pos + word.length )
  end
  count
end
|
134
|
+
|
135
|
+
# Sums up the counts of count_word_in_text for all words.
#
# Returns the total number of matches in text (0 if words is empty).
def count_words_in_text( words, text )
  words.inject( 0 ) { |total, word| total + count_word_in_text( word, text ) }
end
|
142
|
+
|
143
|
+
|
144
|
+
end # class Classifier
|
145
|
+
|
146
|
+
end # module TextUtils
|
data/lib/textutils/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: textutils
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.7
|
4
|
+
version: 0.6.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gerald Bauer
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-08-
|
11
|
+
date: 2013-08-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: logutils
|
@@ -64,6 +64,7 @@ files:
|
|
64
64
|
- README.markdown
|
65
65
|
- Rakefile
|
66
66
|
- lib/textutils.rb
|
67
|
+
- lib/textutils/classifier.rb
|
67
68
|
- lib/textutils/filter/code_filter.rb
|
68
69
|
- lib/textutils/filter/comment_filter.rb
|
69
70
|
- lib/textutils/filter/erb_django_filter.rb
|