ix-cli 0.0.10 → 0.0.11
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/ix-string-similarity +161 -108
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0c5aadde615cdfa81843168a63e691e29be91832d247cb01c769ac6f0be4cf5f
|
4
|
+
data.tar.gz: 178a6b48491c5e5d627390595c60bef6b17c37d150ce02f8ccf59070666b8e2f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 34a177748f513b36e85a425f02865b2c7803b1d2c59abce5cd79aae31bf7ff992adffdf3e7d4277c9312c6aa90f209fb662dde35cc2752366de0c6c7c14c90ea
|
7
|
+
data.tar.gz: 6b79ddb512d93d28444592dd107076eae1e9c24b0b4da4c9f8c4d8fda5a0be46a69a48d2de0b3afd55bf8f741cc15437137908f736d13c2348581cf43d016c21
|
data/bin/ix-string-similarity
CHANGED
@@ -1,152 +1,205 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
|
4
|
-
|
3
|
+
# One matching character pair found while comparing two strings.
#
# char_1 / char_2   - the character taken from the first / second string.
# index_1 / index_2 - the position of that character in the first / second
#                     string.
class Chunk
  attr_accessor :char_1, :char_2, :index_1, :index_2

  # c1, c2 - the characters from the first and second string.
  # i1, i2 - their positions in the first and second string.
  def initialize(c1, c2, i1, i2)
    @char_1 = c1
    @char_2 = c2
    @index_1 = i1
    @index_2 = i2
  end

  # Returns the four fields separated by single spaces, e.g. "a a 0 3".
  def to_s
    "#{char_1} #{char_2} #{index_1} #{index_2}"
  end
end
|
7
20
|
|
8
|
-
|
21
|
+
# Heuristic similarity score between two strings.
#
# The score is the average of two ratios:
#   * how close the two lengths are (shorter length / truncated mean length)
#   * how many runs of adjacent characters of string_1 also occur in string_2
class Similarity
  # Splits a string into single characters. Note that `.` does not match a
  # newline, so newline characters are skipped by `scan`.
  CHAR_REGEX = /./

  attr_accessor :string_1
  attr_accessor :string_2

  def initialize(string_1, string_2)
    @string_1 = string_1
    @string_2 = string_2
  end

  # Returns an Array of Chunk, one for every (position in string_1, position
  # in string_2) pair that holds the same character. Ordered by position in
  # string_1, then by position in string_2.
  def tokens
    # string_2 does not change while iterating, so split it only once.
    chars_2 = string_2.scan(CHAR_REGEX)
    chunks = []
    string_1.scan(CHAR_REGEX).each_with_index do |char_1, index_1|
      chars_2.each_with_index do |char_2, index_2|
        next if char_1 != char_2
        chunks.push(Chunk.new(char_1, char_2, index_1, index_2))
      end
    end
    chunks
  end

  # Returns how many consecutive tokens advance exactly one position in
  # string_1 (the previous token's index_1 is the current index_1 minus one).
  def count
    tokens.map(&:index_1).each_cons(2).count do |previous, current|
      previous == current - 1
    end
  end

  # Returns the similarity as a Float; identical strings score 1.0.
  #
  # An empty string_1 would divide by zero (yielding Infinity or NaN), so it
  # is handled up front: two empty strings are identical (1.0), an empty
  # string against a non-empty one has nothing in common (0.0).
  def score
    return (string_2.empty? ? 1.0 : 0.0) if string_1.empty?

    # NOTE(review): integer division truncates the mean length, so sizes 3
    # and 4 produce a size ratio of 1.0. Kept as-is because existing
    # thresholds are tuned against it; confirm whether this is intended.
    desired = (string_1.size + string_2.size) / 2
    size_thresh = ([string_1.size, string_2.size].min.to_f / desired)
    compatibility_thresh = (count.to_f + 1) / string_1.size
    (size_thresh + compatibility_thresh).to_f / 2
  end
end
|
33
70
|
|
71
|
+
require 'optparse'
|
34
72
|
|
35
|
-
|
73
|
+
# Parses the command line switches found in argv (consuming them) and
# returns the options Hash. :threshold defaults to 0.8.
#
# The threshold value is mandatory once -t/--threshold is given and must be
# numeric; previously a bare `-t` or a non-numeric value was silently turned
# into 0.0, which places every line in a single group.
#
# Raises OptionParser::ParseError (MissingArgument, InvalidArgument,
# InvalidOption) on malformed input.
def parse_options(argv)
  options = {}
  options[:threshold] = 0.8

  OptionParser.new do |opts|
    opts.banner = "Usage: #{$0} [OPTIONS]"

    opts.on('-t', '--threshold NUMBER', Float, 'Threshold default value is 0.8.') do |value|
      options[:threshold] = value
    end
  end.parse!(argv)

  options
end

begin
  options = parse_options(ARGV)
rescue OptionParser::ParseError => error
  $stderr.puts error.message
  exit 1
end

# Abort early when a mandatory option ended up without a value.
required_options = [:threshold]
required_options.each do |option|
  unless options[option]
    $stderr.puts "Can not run #{option.to_s} was not given."
    exit 1
  end
end
|
93
|
+
|
94
|
+
# Groups of similar lines, keyed by the first line seen for each group.
# The key line itself is not stored among its group's records.
#
#   hash = {
#     'line' => [
#       { :line => 'line', :score => 1 },
#     ]
#   }
hash = {}

# Number of non-empty lines read from standard input.
lines = 0

STDIN.each_line do |line|
  line.chomp!
  next if line == ''
  lines += 1

  # Compare the new line against every group key seen so far.
  # NOTE(review): there is no early exit, so a line is recorded in every
  # group whose score exceeds the threshold - confirm this is intended.
  resolved = false
  hash.each do |registered_line, members|
    score = Similarity.new(line, registered_line).score
    next unless score > options[:threshold]
    members.push({ :line => line, :score => score })
    resolved = true
  end

  # A line that matched no existing group becomes the key of a new one.
  hash[line] ||= [] unless resolved
end
|
48
121
|
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
122
|
+
# Plain-text templates for the report. Each template is a Struct whose
# members are substituted by name into a format string by #to_s.
module Template

  # Report header: totals, the threshold in use and the generation time.
  Banner = Struct.new(:lines, :groups, :threshold, :datetime) do
    # Returns the banner text with every member substituted.
    def to_s
      format(template, to_h)
    end

    # Returns the format string; it starts and ends with a newline.
    def template
      '
Total Lines Parsed: %<lines>s
Total Groups Generated: %<groups>s
Similarity Threshold at: %<threshold>s
Generated on: %<datetime>s
'
    end
  end

  # Heading line of one group of similar lines.
  Group = Struct.new(:number, :percent, :items, :line) do
    # Returns the heading text with every member substituted.
    def to_s
      format(template, to_h)
    end

    def template
      'Group %<number>s represents %<percent>s and has %<items>s items similar to: %<line>s'
    end
  end

  # One member line listed under a group heading.
  Item = Struct.new(:count, :total, :score, :line) do
    # Returns the item text with every member substituted.
    def to_s
      format(template, to_h)
    end

    def template
      ' %<count>s/%<total>s %<score>s %<line>s'
    end
  end

end
|
90
157
|
|
91
|
-
|
158
|
+
require 'isna'
|
92
159
|
|
93
|
-
|
160
|
+
# Print the report header. `to_ansi` and the colour helpers are not defined
# in this file; presumably they come from the `isna` gem required above.
banner = Template::Banner.new
banner.lines = lines.to_s.to_ansi.yellow.to_s
banner.groups = hash.keys.size.to_s.to_ansi.yellow.to_s
banner.threshold = options[:threshold].to_s.to_ansi.yellow.to_s
banner.datetime = Time.now.to_s.to_ansi.yellow.to_s
puts banner.to_s

# Order the groups by number of records, ascending; they are printed in
# reverse so the largest group comes first.
sorted_groups_by_n_records_asc = hash.sort do |(_, records_a), (_, records_b)|
  records_a.size <=> records_b.size
end

sorted_groups_by_n_records_asc.reverse.each_with_index do |(line, records), group_index|
  puts ''

  group = Template::Group.new
  # Share of all parsed lines that landed in this group.
  group.percent = ('%2.2f%%' % ((records.size.to_f / lines) * 100)).to_s.to_ansi.red.to_s
  group.number = (group_index + 1).to_s.to_ansi.red.to_s
  group.items = records.size.to_s.to_ansi.cyan.to_s
  group.line = line.chomp.to_ansi.green.to_s
  puts group.to_s

  sorted_items_in_group = records.sort do |a, b|
    a[:score] <=> b[:score]
  end

  # Highest scoring records first. The index is named differently from the
  # outer one so it no longer shadows the group index.
  sorted_items_in_group.reverse.each_with_index do |record, item_index|
    item = Template::Item.new
    item.count = (item_index + 1).to_s.rjust(4, ' ').to_ansi.cyan.to_s
    item.total = records.size.to_s.ljust(4, ' ').to_ansi.cyan.to_s
    item.score = ('%4.2f%%' % (record[:score] * 100)).rjust(7, ' ').to_ansi.green.to_s
    item.line = record[:line]
    puts item.to_s
  end

end
|
152
205
|
|