ix-cli 0.0.10 → 0.0.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/ix-string-similarity +161 -108
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0c5aadde615cdfa81843168a63e691e29be91832d247cb01c769ac6f0be4cf5f
|
4
|
+
data.tar.gz: 178a6b48491c5e5d627390595c60bef6b17c37d150ce02f8ccf59070666b8e2f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 34a177748f513b36e85a425f02865b2c7803b1d2c59abce5cd79aae31bf7ff992adffdf3e7d4277c9312c6aa90f209fb662dde35cc2752366de0c6c7c14c90ea
|
7
|
+
data.tar.gz: 6b79ddb512d93d28444592dd107076eae1e9c24b0b4da4c9f8c4d8fda5a0be46a69a48d2de0b3afd55bf8f741cc15437137908f736d13c2348581cf43d016c21
|
data/bin/ix-string-similarity
CHANGED
@@ -1,152 +1,205 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
|
4
|
-
|
3
|
+
class Chunk
|
4
|
+
attr_accessor :char_1
|
5
|
+
attr_accessor :char_2
|
6
|
+
attr_accessor :index_1
|
7
|
+
attr_accessor :index_2
|
8
|
+
|
9
|
+
def initialize(c1, c2, i1, i2)
|
10
|
+
@char_1 = c1
|
11
|
+
@char_2 = c2
|
12
|
+
@index_1 = i1
|
13
|
+
@index_2 = i2
|
14
|
+
end
|
5
15
|
|
6
|
-
|
16
|
+
def to_s
|
17
|
+
"#{char_1} #{char_2} #{index_1} #{index_2}"
|
18
|
+
end
|
19
|
+
end
|
7
20
|
|
8
|
-
|
21
|
+
class Similarity
|
22
|
+
CHAR_REGEX = /./
|
9
23
|
|
10
|
-
|
11
|
-
|
12
|
-
opts.separator "Usage: #{File.basename($0)} [OPTIONS]"
|
13
|
-
opts.separator ''
|
24
|
+
attr_accessor :string_1
|
25
|
+
attr_accessor :string_2
|
14
26
|
|
15
|
-
|
16
|
-
|
17
|
-
|
27
|
+
def initialize(string_1, string_2)
|
28
|
+
@string_1 = string_1
|
29
|
+
@string_2 = string_2
|
30
|
+
end
|
18
31
|
|
19
|
-
|
20
|
-
|
21
|
-
|
32
|
+
def tokens
|
33
|
+
chunks = []
|
34
|
+
string_1.scan(CHAR_REGEX).each_with_index do |char_1, index_1|
|
35
|
+
string_2.scan(CHAR_REGEX).each_with_index do |char_2, index_2|
|
36
|
+
next if char_1 != char_2
|
37
|
+
chunks.push(Chunk.new(char_1, char_2, index_1, index_2))
|
38
|
+
end
|
39
|
+
end
|
40
|
+
chunks
|
22
41
|
end
|
23
42
|
|
24
|
-
|
25
|
-
|
26
|
-
|
43
|
+
def count
|
44
|
+
counter = 0
|
45
|
+
prev = false
|
46
|
+
tokens.each_with_index do |chunk, index|
|
47
|
+
|
48
|
+
unless prev
|
49
|
+
prev = chunk.index_1
|
50
|
+
next
|
51
|
+
end
|
52
|
+
|
53
|
+
if prev == (chunk.index_1 - 1)
|
54
|
+
counter += 1
|
55
|
+
end
|
56
|
+
|
57
|
+
prev = chunk.index_1
|
58
|
+
end
|
59
|
+
|
60
|
+
counter
|
27
61
|
end
|
28
62
|
|
29
|
-
|
30
|
-
|
31
|
-
|
63
|
+
def score
|
64
|
+
desired = (string_1.size + string_2.size) / 2
|
65
|
+
size_thresh = ([string_1.size, string_2.size].sort.first.to_f / desired)
|
66
|
+
compatibility_thresh = (count.to_f + 1) / string_1.size
|
67
|
+
(size_thresh + compatibility_thresh).to_f / 2
|
32
68
|
end
|
69
|
+
end
|
33
70
|
|
71
|
+
require 'optparse'
|
34
72
|
|
35
|
-
|
73
|
+
options = {}
|
74
|
+
options[:threshold] = 0.8
|
75
|
+
|
76
|
+
OptionParser.new do |opts|
|
77
|
+
|
78
|
+
opts.banner = "Usage: #{$0} [OPTIONS]"
|
79
|
+
|
80
|
+
opts.on('-t', '--threshold [NUMBER]', 'Threshold default value is 0.8.') do |value|
|
81
|
+
options[:threshold] = value.to_f
|
82
|
+
end
|
36
83
|
|
37
84
|
end.parse!
|
38
85
|
|
39
|
-
|
86
|
+
required_options = [:threshold]
|
87
|
+
required_options.each do |option|
|
88
|
+
unless options[option]
|
89
|
+
$stderr.puts "Can not run #{option.to_s} was not given."
|
90
|
+
exit 1
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
94
|
+
# hash = {
|
95
|
+
# 'line' => [
|
96
|
+
# { :line => 'line', :score => 1 },
|
97
|
+
# ]
|
98
|
+
# }
|
99
|
+
|
100
|
+
hash = {}
|
101
|
+
lines = 0
|
40
102
|
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
103
|
+
STDIN.each_line do |line|
|
104
|
+
line.chomp!
|
105
|
+
next if line == ''
|
106
|
+
lines += 1
|
107
|
+
resolved = false
|
108
|
+
hash.keys.each do |registered_line|
|
109
|
+
score = Similarity.new(line, registered_line).score
|
110
|
+
if score > options[:threshold]
|
111
|
+
hash[registered_line].push({
|
112
|
+
:line => line,
|
113
|
+
:score => score
|
114
|
+
})
|
115
|
+
resolved = true
|
45
116
|
end
|
46
117
|
end
|
118
|
+
next if resolved
|
119
|
+
hash[line] ||= []
|
47
120
|
end
|
48
121
|
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
122
|
+
module Template
|
123
|
+
|
124
|
+
class Banner < Struct.new(:lines, :groups, :threshold, :datetime)
|
125
|
+
def to_s
|
126
|
+
format(template, to_h)
|
127
|
+
end
|
128
|
+
def template
|
129
|
+
'
|
130
|
+
Total Lines Parsed: %<lines>s
|
131
|
+
Total Groups Generated: %<groups>s
|
132
|
+
Similarity Theshold at: %<threshold>s
|
133
|
+
Generated on: %<datetime>s
|
134
|
+
'
|
54
135
|
end
|
55
|
-
array
|
56
136
|
end
|
57
137
|
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
longest_string = other_string
|
62
|
-
shortest_string = self
|
63
|
-
else
|
64
|
-
longest_string = self
|
65
|
-
shortest_string = other_string
|
66
|
-
end
|
67
|
-
scores = longest_string.to_a.map do |char|
|
68
|
-
0
|
138
|
+
class Group < Struct.new(:number, :percent, :items, :line)
|
139
|
+
def to_s
|
140
|
+
format(template, to_h)
|
69
141
|
end
|
70
|
-
|
71
|
-
|
72
|
-
scores[index] = 1
|
73
|
-
end
|
142
|
+
def template
|
143
|
+
'Group %<number>s represents %<percent>s and has %<items>s items similar to: %<line>s'
|
74
144
|
end
|
75
|
-
scores
|
76
145
|
end
|
77
146
|
|
78
|
-
|
79
|
-
|
147
|
+
class Item < Struct.new(:count, :total, :score, :line)
|
148
|
+
def to_s
|
149
|
+
format(template, to_h)
|
150
|
+
end
|
151
|
+
def template
|
152
|
+
' %<count>s/%<total>s %<score>s %<line>s'
|
153
|
+
end
|
80
154
|
end
|
81
|
-
end
|
82
155
|
|
83
|
-
class TargetString
|
84
|
-
attr_accessor :evaluated
|
85
|
-
attr_accessor :data
|
86
|
-
def to_s
|
87
|
-
data
|
88
|
-
end
|
89
156
|
end
|
90
157
|
|
91
|
-
|
158
|
+
require 'isna'
|
92
159
|
|
93
|
-
|
160
|
+
banner = Template::Banner.new
|
161
|
+
banner.lines = lines.to_s.to_ansi.yellow.to_s
|
162
|
+
banner.groups = hash.keys.size.to_s.to_ansi.yellow.to_s
|
163
|
+
banner.threshold = options[:threshold].to_s.to_ansi.yellow.to_s
|
164
|
+
banner.datetime = Time.now.to_s.to_ansi.yellow.to_s
|
165
|
+
puts banner.to_s
|
94
166
|
|
95
|
-
|
96
|
-
next if line.chomp == ''
|
97
|
-
strings << line.chomp
|
98
|
-
end
|
167
|
+
groups = []
|
99
168
|
|
100
|
-
|
101
|
-
|
169
|
+
hash.each do |category_name, records|
|
170
|
+
groups.push([category_name, records.size])
|
102
171
|
end
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
target_string.evaluated = false
|
109
|
-
target_string.data = string
|
110
|
-
target_string
|
172
|
+
|
173
|
+
sorted_groups_by_n_records_asc = groups.sort do |array_a, array_b|
|
174
|
+
number_of_records_in_a = array_a[1]
|
175
|
+
number_of_records_in_b = array_b[1]
|
176
|
+
number_of_records_in_a <=> number_of_records_in_b
|
111
177
|
end
|
112
178
|
|
113
|
-
|
114
|
-
|
115
|
-
|
179
|
+
sorted_groups_by_n_records_asc.reverse.each_with_index do |key, index|
|
180
|
+
line, records = key[0], hash[key[0]]
|
181
|
+
|
182
|
+
puts ''
|
183
|
+
|
184
|
+
group = Template::Group.new
|
185
|
+
group.percent = ('%2.2f%%' % ((records.size.to_f / lines) * 100)).to_s.to_ansi.red.to_s
|
186
|
+
group.number = (index + 1).to_s.to_ansi.red.to_s
|
187
|
+
group.items = records.size.to_s.to_ansi.cyan.to_s
|
188
|
+
group.line = line.chomp.to_ansi.green.to_s
|
189
|
+
puts group.to_s
|
190
|
+
|
191
|
+
sorted_items_in_group = records.sort do |a, b|
|
192
|
+
a[:score] <=> b[:score]
|
116
193
|
end
|
117
|
-
else
|
118
|
-
groups = { 0 => strings }
|
119
|
-
end
|
120
194
|
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
else
|
129
|
-
puts "****>>" + string_1.to_s
|
130
|
-
end
|
131
|
-
end
|
132
|
-
string_1.evaluated = true
|
133
|
-
group.each do |string_2|
|
134
|
-
next if string_2.evaluated
|
135
|
-
similarity = string_1.to_s.similarity(string_2.to_s)
|
136
|
-
scores = string_1.to_s.scores(string_2.to_s).inspect
|
137
|
-
template = "%5.f %s"
|
138
|
-
bindings = [similarity, string_2, scores]
|
139
|
-
if similarity >= configuration.threshold
|
140
|
-
string_2.evaluated = true
|
141
|
-
counter += 1
|
142
|
-
unless configuration.summary
|
143
|
-
puts template % bindings
|
144
|
-
end
|
145
|
-
end
|
146
|
-
end
|
147
|
-
if counter > 0
|
148
|
-
puts "#{counter} #{summary_string}"
|
149
|
-
end
|
195
|
+
sorted_items_in_group.reverse.each_with_index do |record, index|
|
196
|
+
item = Template::Item.new
|
197
|
+
item.count = (index + 1).to_s.rjust(4, ' ').to_ansi.cyan.to_s
|
198
|
+
item.total = records.size.to_s.ljust(4, ' ').to_ansi.cyan.to_s
|
199
|
+
item.score = ('%4.2f%%' % (record[:score] * 100)).rjust(7, ' ').to_ansi.green.to_s
|
200
|
+
item.line = record[:line]
|
201
|
+
puts item.to_s
|
150
202
|
end
|
203
|
+
|
151
204
|
end
|
152
205
|
|