ix-cli 0.0.10 → 0.0.11

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +4 -4
  2. data/bin/ix-string-similarity +161 -108
  3. metadata +1 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9356248fcc4f5fe5c83fedeb01e13166a9764bed70c39fe06132963141f394d9
4
- data.tar.gz: c7676df3c6b0e8427888882bc02fa609e6aa987816f76ef90e656ca6ce196ed7
3
+ metadata.gz: 0c5aadde615cdfa81843168a63e691e29be91832d247cb01c769ac6f0be4cf5f
4
+ data.tar.gz: 178a6b48491c5e5d627390595c60bef6b17c37d150ce02f8ccf59070666b8e2f
5
5
  SHA512:
6
- metadata.gz: 21233826e4b716857231aa89c0abc3263702cf0347b71a2bfc40e3502b79946b98a7b3ae5496bf1975121ef1f3c44952bedd0415d817ac7eee34246ec91007b6
7
- data.tar.gz: fbee1010c9fda5caf7144b8855c6ff8f502413ba3dbee2b81c630340d6c429299878b9b20cc34bc6d9fe2912e3b5e4cee44159f0b31fc947d0467957f7cf6802
6
+ metadata.gz: 34a177748f513b36e85a425f02865b2c7803b1d2c59abce5cd79aae31bf7ff992adffdf3e7d4277c9312c6aa90f209fb662dde35cc2752366de0c6c7c14c90ea
7
+ data.tar.gz: 6b79ddb512d93d28444592dd107076eae1e9c24b0b4da4c9f8c4d8fda5a0be46a69a48d2de0b3afd55bf8f741cc15437137908f736d13c2348581cf43d016c21
@@ -1,152 +1,205 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require 'optparse'
4
- require 'ostruct'
3
+ class Chunk
4
+ attr_accessor :char_1
5
+ attr_accessor :char_2
6
+ attr_accessor :index_1
7
+ attr_accessor :index_2
8
+
9
+ def initialize(c1, c2, i1, i2)
10
+ @char_1 = c1
11
+ @char_2 = c2
12
+ @index_1 = i1
13
+ @index_2 = i2
14
+ end
5
15
 
6
- configuration = OpenStruct.new
16
+ def to_s
17
+ "#{char_1} #{char_2} #{index_1} #{index_2}"
18
+ end
19
+ end
7
20
 
8
- OptionParser.new do |opts|
21
+ class Similarity
22
+ CHAR_REGEX = /./
9
23
 
10
- opts.banner = "Find similarity in a set of strings."
11
- opts.separator ''
12
- opts.separator "Usage: #{File.basename($0)} [OPTIONS]"
13
- opts.separator ''
24
+ attr_accessor :string_1
25
+ attr_accessor :string_2
14
26
 
15
- configuration.group = false
16
- configuration.threshold = 50
17
- configuration.summary = false
27
+ def initialize(string_1, string_2)
28
+ @string_1 = string_1
29
+ @string_2 = string_2
30
+ end
18
31
 
19
- description = "Group input in batches and process each individually (faster)"
20
- opts.on("-g", "--group", description) do |v|
21
- configuration.group = v
32
+ def tokens
33
+ chunks = []
34
+ string_1.scan(CHAR_REGEX).each_with_index do |char_1, index_1|
35
+ string_2.scan(CHAR_REGEX).each_with_index do |char_2, index_2|
36
+ next if char_1 != char_2
37
+ chunks.push(Chunk.new(char_1, char_2, index_1, index_2))
38
+ end
39
+ end
40
+ chunks
22
41
  end
23
42
 
24
- description = "Limit the number of results by threshold, default is 50"
25
- opts.on("-t", "--threshhold [NUMBER]", OptionParser::DecimalNumeric, description) do |v|
26
- configuration.threshold = v
43
+ def count
44
+ counter = 0
45
+ prev = false
46
+ tokens.each_with_index do |chunk, index|
47
+
48
+ unless prev
49
+ prev = chunk.index_1
50
+ next
51
+ end
52
+
53
+ if prev == (chunk.index_1 - 1)
54
+ counter += 1
55
+ end
56
+
57
+ prev = chunk.index_1
58
+ end
59
+
60
+ counter
27
61
  end
28
62
 
29
- description = "Print a Summary of the groups found"
30
- opts.on("-s", "--summary", description) do |v|
31
- configuration.summary = v
63
+ def score
64
+ desired = (string_1.size + string_2.size) / 2
65
+ size_thresh = ([string_1.size, string_2.size].sort.first.to_f / desired)
66
+ compatibility_thresh = (count.to_f + 1) / string_1.size
67
+ (size_thresh + compatibility_thresh).to_f / 2
32
68
  end
69
+ end
33
70
 
71
+ require 'optparse'
34
72
 
35
- opts.separator ''
73
+ options = {}
74
+ options[:threshold] = 0.8
75
+
76
+ OptionParser.new do |opts|
77
+
78
+ opts.banner = "Usage: #{$0} [OPTIONS]"
79
+
80
+ opts.on('-t', '--threshold [NUMBER]', 'Threshold default value is 0.8.') do |value|
81
+ options[:threshold] = value.to_f
82
+ end
36
83
 
37
84
  end.parse!
38
85
 
39
- # puts configuration.inspect
86
+ required_options = [:threshold]
87
+ required_options.each do |option|
88
+ unless options[option]
89
+ $stderr.puts "Can not run #{option.to_s} was not given."
90
+ exit 1
91
+ end
92
+ end
93
+
94
+ # hash = {
95
+ # 'line' => [
96
+ # { :line => 'line', :score => 1 },
97
+ # ]
98
+ # }
99
+
100
+ hash = {}
101
+ lines = 0
40
102
 
41
- class Array
42
- def product
43
- inject do |cumulative, value|
44
- cumulative += value
103
+ STDIN.each_line do |line|
104
+ line.chomp!
105
+ next if line == ''
106
+ lines += 1
107
+ resolved = false
108
+ hash.keys.each do |registered_line|
109
+ score = Similarity.new(line, registered_line).score
110
+ if score > options[:threshold]
111
+ hash[registered_line].push({
112
+ :line => line,
113
+ :score => score
114
+ })
115
+ resolved = true
45
116
  end
46
117
  end
118
+ next if resolved
119
+ hash[line] ||= []
47
120
  end
48
121
 
49
- class String
50
- def to_a
51
- array = []
52
- size.times do |n|
53
- array << self[n]
122
+ module Template
123
+
124
+ class Banner < Struct.new(:lines, :groups, :threshold, :datetime)
125
+ def to_s
126
+ format(template, to_h)
127
+ end
128
+ def template
129
+ '
130
+ Total Lines Parsed: %<lines>s
131
+ Total Groups Generated: %<groups>s
132
+ Similarity Theshold at: %<threshold>s
133
+ Generated on: %<datetime>s
134
+ '
54
135
  end
55
- array
56
136
  end
57
137
 
58
- def scores(other_string)
59
- longest_string = nil
60
- if other_string.size > self.size
61
- longest_string = other_string
62
- shortest_string = self
63
- else
64
- longest_string = self
65
- shortest_string = other_string
66
- end
67
- scores = longest_string.to_a.map do |char|
68
- 0
138
+ class Group < Struct.new(:number, :percent, :items, :line)
139
+ def to_s
140
+ format(template, to_h)
69
141
  end
70
- shortest_string.size.times do |index|
71
- if shortest_string[index] == longest_string[index]
72
- scores[index] = 1
73
- end
142
+ def template
143
+ 'Group %<number>s represents %<percent>s and has %<items>s items similar to: %<line>s'
74
144
  end
75
- scores
76
145
  end
77
146
 
78
- def similarity(other_string)
79
- scores(other_string).product * 100.0 / size
147
+ class Item < Struct.new(:count, :total, :score, :line)
148
+ def to_s
149
+ format(template, to_h)
150
+ end
151
+ def template
152
+ ' %<count>s/%<total>s %<score>s %<line>s'
153
+ end
80
154
  end
81
- end
82
155
 
83
- class TargetString
84
- attr_accessor :evaluated
85
- attr_accessor :data
86
- def to_s
87
- data
88
- end
89
156
  end
90
157
 
91
- # client
158
+ require 'isna'
92
159
 
93
- strings = []
160
+ banner = Template::Banner.new
161
+ banner.lines = lines.to_s.to_ansi.yellow.to_s
162
+ banner.groups = hash.keys.size.to_s.to_ansi.yellow.to_s
163
+ banner.threshold = options[:threshold].to_s.to_ansi.yellow.to_s
164
+ banner.datetime = Time.now.to_s.to_ansi.yellow.to_s
165
+ puts banner.to_s
94
166
 
95
- STDIN.each_line do |line|
96
- next if line.chomp == ''
97
- strings << line.chomp
98
- end
167
+ groups = []
99
168
 
100
- strings.sort! do |n1, n2|
101
- n1.size <=> n2.size
169
+ hash.each do |category_name, records|
170
+ groups.push([category_name, records.size])
102
171
  end
103
-
104
- strings.reverse!
105
-
106
- strings.map! do |string|
107
- target_string = TargetString.new
108
- target_string.evaluated = false
109
- target_string.data = string
110
- target_string
172
+
173
+ sorted_groups_by_n_records_asc = groups.sort do |array_a, array_b|
174
+ number_of_records_in_a = array_a[1]
175
+ number_of_records_in_b = array_b[1]
176
+ number_of_records_in_a <=> number_of_records_in_b
111
177
  end
112
178
 
113
- if configuration.group
114
- groups = strings.group_by do |string|
115
- string.data.size
179
+ sorted_groups_by_n_records_asc.reverse.each_with_index do |key, index|
180
+ line, records = key[0], hash[key[0]]
181
+
182
+ puts ''
183
+
184
+ group = Template::Group.new
185
+ group.percent = ('%2.2f%%' % ((records.size.to_f / lines) * 100)).to_s.to_ansi.red.to_s
186
+ group.number = (index + 1).to_s.to_ansi.red.to_s
187
+ group.items = records.size.to_s.to_ansi.cyan.to_s
188
+ group.line = line.chomp.to_ansi.green.to_s
189
+ puts group.to_s
190
+
191
+ sorted_items_in_group = records.sort do |a, b|
192
+ a[:score] <=> b[:score]
116
193
  end
117
- else
118
- groups = { 0 => strings }
119
- end
120
194
 
121
- counter = 0
122
- groups.each do |key, group|
123
- group.each do |string_1|
124
- counter = 0
125
- unless string_1.evaluated
126
- if configuration.summary
127
- summary_string = string_1.to_s
128
- else
129
- puts "****>>" + string_1.to_s
130
- end
131
- end
132
- string_1.evaluated = true
133
- group.each do |string_2|
134
- next if string_2.evaluated
135
- similarity = string_1.to_s.similarity(string_2.to_s)
136
- scores = string_1.to_s.scores(string_2.to_s).inspect
137
- template = "%5.f %s"
138
- bindings = [similarity, string_2, scores]
139
- if similarity >= configuration.threshold
140
- string_2.evaluated = true
141
- counter += 1
142
- unless configuration.summary
143
- puts template % bindings
144
- end
145
- end
146
- end
147
- if counter > 0
148
- puts "#{counter} #{summary_string}"
149
- end
195
+ sorted_items_in_group.reverse.each_with_index do |record, index|
196
+ item = Template::Item.new
197
+ item.count = (index + 1).to_s.rjust(4, ' ').to_ansi.cyan.to_s
198
+ item.total = records.size.to_s.ljust(4, ' ').to_ansi.cyan.to_s
199
+ item.score = ('%4.2f%%' % (record[:score] * 100)).rjust(7, ' ').to_ansi.green.to_s
200
+ item.line = record[:line]
201
+ puts item.to_s
150
202
  end
203
+
151
204
  end
152
205
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ix-cli
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.10
4
+ version: 0.0.11
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kazuyoshi Tlacaelel