textutils 0.6.7 → 0.6.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Manifest.txt +1 -0
- data/lib/textutils.rb +2 -0
- data/lib/textutils/classifier.rb +146 -0
- data/lib/textutils/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c59b2415860e0f99e9f0a71f8a9ee90f53d62e50
|
4
|
+
data.tar.gz: 4bf60477e5056ecfa8c7c20c8fe9d5a93a0d93e1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1703cfd9dd7e0cef58d5e29e718ae2606dae35810b86687f56dc160f3903eb4f7bd6327a42ea51f91f0de51d38f5df062ad9002734c06798d16f4d24f963c748
|
7
|
+
data.tar.gz: 35d720818928c515babb1d3f9ea3a4c34bc793e5de1889b39d8ddad52ddcb3db7bf0b7bc9f19d4413fa457b10e4bc80bbc53097aeed2d755e7815e6b409bfe20
|
data/Manifest.txt
CHANGED
data/lib/textutils.rb
CHANGED
data/lib/textutils/classifier.rb
ADDED
@@ -0,0 +1,146 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module TextUtils

  # Naive keyword-counting text classifier.
  #
  # Every key (e.g. a language code such as 'en' or 'de', or a topic) is
  # trained with a list of words. #classify counts how often the words of
  # each key occur in a text and returns the key with the highest count.
  #
  # Matching is plain substring search (String#index); there is no
  # tokenizing or word-boundary check.
  class Classifier

    include LogUtils::Logging

    def initialize
      # key => word list; the block gives every key its own array
      # (no default array object shared between keys)
      @h = Hash.new { |hash, key| hash[key] = [] }
    end

    # Add words to a lang/topic key. May be called repeatedly for the same
    # key; the words accumulate.
    #
    # key                - lang/topic key the words belong to
    # ary_or_hash_or_str - one of
    #   Array:  used as the word list as is
    #   Hash:   keys are ignored; every value is a string w/ words separated by |
    #   String: words separated by |   (e.g. 'und|oder|nicht')
    #
    # Returns the complete word list stored for key.
    def train( key, ary_or_hash_or_str )
      words =
        case ary_or_hash_or_str
        when Array
          ary_or_hash_or_str
        when Hash
          ary_or_hash_or_str.values.flat_map { |value| value.strip.split('|') }
        else # assume string (allow list separated by |)
          ary_or_hash_or_str.strip.split('|')
        end

      # concat copies the elements, so the caller's array is never aliased
      @h[ key ].concat( words )
    end

    # Read the file at path (via File.read_utf8) and classify its content.
    # See #classify for the return value.
    def classify_file( path )
      classify( File.read_utf8( path ) )
    end

    # Classify text.
    #
    # Comment lines and trailing end-of-line comments are stripped first
    # (see #strip_comments), so words inside comments do not count.
    #
    # Returns the key w/ the highest word count, or nil if no key has been
    # trained yet.
    def classify( text_with_comments )

      ## check encoding
      logger.debug " classify - text.encoding: #{text_with_comments.encoding.name}"

      # nothing trained => no ranking possible
      # (guards against calling [] on nil for counts[0] below)
      if @h.empty?
        logger.debug "classifier - no keys trained; returning nil"
        return nil
      end

      # nb: strip comments first
      text = strip_comments( text_with_comments )

      ## e.g. [[ 'en', 20],   # 20 words
      ##       [ 'de',  2]]   #  2 words
      counts = @h.each_with_index.map do |(key, words), i|
        logger.debug "key #{key} (#{i+1}/#{@h.size}) - #{words.size} words"
        [key, count_words_in_text( words, text )]
      end

      # sort by word count (reverse sort e.g. highest count goes first)
      counts = counts.sort { |l, r| r[1] <=> l[1] }

      # dump stats
      ## e.g. 1. en: 20 words
      ##      2. de:  2 words
      logger.debug "results:"
      counts.each_with_index do |(key, count), i|
        logger.debug " #{i+1}. #{key}: #{count}"
      end

      best_key = counts[0][0]
      logger.debug "classifier - using key >>#{best_key}<<"

      ## return key/lang code w/ highest count
      best_key
    end

    # For debugging: log the setup (that is, keys w/ words etc.) plus the
    # encodings found in every word list.
    def dump
      @h.each_with_index do |(key, words), i|
        logger.debug "key #{key} (#{i+1}/#{@h.size}) - #{words.size} words:"
        logger.debug words.inspect

        ## check encoding of words (trouble w/ windows cp850 argh!!!)
        ##  log the encoding only when it differs from the previous word's
        last_encoding_name = ''
        words.each do |word|
          next if last_encoding_name == word.encoding.name

          logger.debug " encoding: #{word.encoding.name}"
          last_encoding_name = word.encoding.name
        end
      end
    end

    private

    # Return a copy of text w/o comments; every kept line ends w/ exactly
    # one newline.
    #
    # Lines starting (after optional whitespace) w/ one of the following
    # are dropped completely:
    #   1) #    (shell/ruby style)
    #   2) --   (haskell/sql style)
    #   3) %    (tex/latex style)
    #
    # Trailing end-of-line comments (whitespace followed by #) get cut off
    #   e.g  -> nyc, New York   # Sample EOL Comment Here (with or without commas,,,,)
    #   becomes -> nyc, New York
    def strip_comments( text )
      new_text = ''

      text.each_line do |raw_line|
        # each_line keeps the line terminator; drop it here so the newline
        # appended below does not double every line break
        line = raw_line.chomp

        if line =~ /^\s*(#|--|%)/
          # skip comments and do NOT copy to result (keep comments secret!)
          logger.debug 'skipping comment line'
          next
        end

        # remove possible trailing eol comment
        line = line.sub( /\s+#.+$/, '' )

        new_text << line
        new_text << "\n"
      end

      new_text
    end

    # Count the non-overlapping occurrences of word in text.
    # An empty word never matches (returns 0).
    def count_word_in_text( word, text )
      # String#index finds '' at every position and would never advance
      # (pos + 0) - without this guard the loop below runs forever
      return 0 if word.empty?

      count = 0
      pos = text.index( word )
      while pos
        count += 1
        logger.debug "bingo - found >>#{word}<< on pos #{pos}, count: #{count}"
        # continue right after the match; pos+word.length is the first
        # character not covered by it (no +1 needed)
        pos = text.index( word, pos + word.length )
      end
      count
    end

    # Sum of count_word_in_text for every word in words.
    def count_words_in_text( words, text )
      words.inject( 0 ) { |total, word| total + count_word_in_text( word, text ) }
    end

  end # class Classifier

end # module TextUtils
|
data/lib/textutils/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: textutils
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.7
|
4
|
+
version: 0.6.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gerald Bauer
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-08-
|
11
|
+
date: 2013-08-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: logutils
|
@@ -64,6 +64,7 @@ files:
|
|
64
64
|
- README.markdown
|
65
65
|
- Rakefile
|
66
66
|
- lib/textutils.rb
|
67
|
+
- lib/textutils/classifier.rb
|
67
68
|
- lib/textutils/filter/code_filter.rb
|
68
69
|
- lib/textutils/filter/comment_filter.rb
|
69
70
|
- lib/textutils/filter/erb_django_filter.rb
|