inci_score 3.1.3 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE.txt +21 -0
- data/README.md +14 -50
- data/bin/console +5 -4
- data/bin/inci_score +2 -2
- data/lib/inci_score/cli.rb +18 -19
- data/lib/inci_score/computer.rb +28 -28
- data/lib/inci_score/config.rb +10 -0
- data/lib/inci_score/ingredient.rb +18 -19
- data/lib/inci_score/levenshtein.rb +12 -9
- data/lib/inci_score/normalizer.rb +8 -6
- data/lib/inci_score/normalizer_rules.rb +13 -28
- data/lib/inci_score/recognizer.rb +15 -10
- data/lib/inci_score/recognizer_rules.rb +11 -11
- data/lib/inci_score/refinements.rb +3 -1
- data/lib/inci_score/response.rb +14 -7
- data/lib/inci_score/score.rb +8 -3
- data/lib/inci_score/scorer.rb +20 -13
- data/lib/inci_score/version.rb +3 -1
- data/lib/inci_score.rb +10 -4
- metadata +15 -52
- data/.gitignore +0 -13
- data/.travis.yml +0 -7
- data/Gemfile +0 -4
- data/Rakefile +0 -24
- data/config/catalog.yml +0 -5018
- data/config.ru +0 -3
- data/ext/levenshtein.c +0 -43
- data/inci_score.gemspec +0 -28
- data/lib/inci_score/api.rb +0 -19
- data/lib/inci_score/catalog.rb +0 -13
- data/lib/inci_score/server.rb +0 -51
- data/log/.gitignore +0 -4
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 65b6b9212d839ed17a978457cc654b835e34e6a7d68dad7c61cbd3f54eaac8cf
|
|
4
|
+
data.tar.gz: ab6f3cebda2cbf8875aa203edc747864f0953559bbf8472f4f757e3a1b505b8b
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: b51ecb14b357c1926f34ee19d558b5aed024bbda9e5070149abe046f9ef713e40d03749d3ce3d78ee6c1fd48358e9530fb84bed301e7024de18af5466bab5865
|
|
7
|
+
data.tar.gz: 5974307d97068c706bcc483854fd27c6a12e8761d86f3328cfbb1e561cfb06432ddeefcd18aa9d15fee0f3ad8a90260de4919f5a99884119462d93982d361f86
|
data/LICENSE.txt
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2022 Commerce Layer
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
data/README.md
CHANGED
|
@@ -11,9 +11,6 @@
|
|
|
11
11
|
* [CLI](#cli)
|
|
12
12
|
* [Benchmark](#benchmark)
|
|
13
13
|
* [Levenshtein in C](#levenshtein-in-c)
|
|
14
|
-
* [Platform](#platform)
|
|
15
|
-
* [Wrk](#wrk)
|
|
16
|
-
* [Results](#results)
|
|
17
14
|
|
|
18
15
|
## Scope
|
|
19
16
|
This gem computes the score of cosmetic components based on the information provided by the [Biodizionario site](http://www.biodizionario.it/) by Fabrizio Zago.
|
|
@@ -70,8 +67,8 @@ In such case the score is computed anyway by considering only recognized compone
|
|
|
70
67
|
It is still possible to query the object for its state:
|
|
71
68
|
|
|
72
69
|
```ruby
|
|
73
|
-
inci = InciScore::Computer.new(src: 'ingredients:aqua,noent1,noent2')
|
|
74
|
-
inci.valid # false
|
|
70
|
+
inci = InciScore::Computer.new(src: 'ingredients:aqua,noent1,noent2')
|
|
71
|
+
inci.valid? # false
|
|
75
72
|
inci.unrecognized # ["noent1", "noent2"]
|
|
76
73
|
```
|
|
77
74
|
|
|
@@ -82,32 +79,15 @@ You can collect INCI data by using the available CLI interface:
|
|
|
82
79
|
inci_score --src="ingredients: aqua, dimethicone, pej-10, noent"
|
|
83
80
|
|
|
84
81
|
TOTAL SCORE:
|
|
85
|
-
|
|
82
|
+
47.18
|
|
86
83
|
VALID STATE:
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
84
|
+
true
|
|
85
|
+
PRECISION:
|
|
86
|
+
75.0
|
|
87
|
+
COMPONENTS:
|
|
88
|
+
aqua\n dimethicone\n peg-10
|
|
92
89
|
UNRECOGNIZED:
|
|
93
|
-
|
|
94
|
-
```
|
|
95
|
-
|
|
96
|
-
#### HTTP server
|
|
97
|
-
The CLI interface exposes a Web layer based on the [Puma](http://puma.io/) application server.
|
|
98
|
-
The HTTP server is started on the specified port by spawning as many workers as your current workstation supports:
|
|
99
|
-
```shell
|
|
100
|
-
inci_score --http=9292
|
|
101
|
-
```
|
|
102
|
-
Consider all other options are discarded when running HTTP server.
|
|
103
|
-
|
|
104
|
-
##### Triggering a request
|
|
105
|
-
The HTTP server responds with a JSON representation of the original *InciScore::Response* object.
|
|
106
|
-
You can pass the source string directly as a HTTP parameter (URI escaped):
|
|
107
|
-
|
|
108
|
-
```shell
|
|
109
|
-
curl http://127.0.0.1:9292?src=aqua,dimethicone
|
|
110
|
-
=> {"components":{"aqua":0,"dimethicone":4},"unrecognized":[],"score":53.7629,"valid":true}
|
|
90
|
+
noent
|
|
111
91
|
```
|
|
112
92
|
|
|
113
93
|
#### Getting help
|
|
@@ -115,7 +95,6 @@ You can get CLI interface help by:
|
|
|
115
95
|
```shell
|
|
116
96
|
Usage: inci_score --src="aqua, parfum, etc"
|
|
117
97
|
-s, --src=SRC The INCI list: "aqua, parfum, etc"
|
|
118
|
-
--http=PORT Start HTTP server on the specified port
|
|
119
98
|
-h, --help Prints this help
|
|
120
99
|
```
|
|
121
100
|
|
|
@@ -124,25 +103,10 @@ Usage: inci_score --src="aqua, parfum, etc"
|
|
|
124
103
|
### Levenshtein in C
|
|
125
104
|
I noticed the API slows down dramatically when dealing with unrecognized components that need fuzzy matching.
|
|
126
105
|
I profiled the code by using the [benchmark-ips](https://github.com/evanphx/benchmark-ips) gem, finding the bottleneck was the pure Ruby implementation of the Levenshtein distance algorithm.
|
|
127
|
-
After some pointless optimization, i replaced this routine with a C implementation: i opted for the straightforward [Ruby Inline](https://github.com/seattlerb/rubyinline) library to call the C code straight from Ruby.
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
* OSX Sierra
|
|
132
|
-
* 2,2 GHz Intel Core i7 (4 cores)
|
|
133
|
-
* 16 GB 1600 MHz DDR3
|
|
134
|
-
* Ruby 2.4
|
|
135
|
-
|
|
136
|
-
### Wrk
|
|
137
|
-
As always i used [wrk](https://github.com/wg/wrk) as the loading tool.
|
|
138
|
-
I measured the library three times, picking the best lap.
|
|
106
|
+
After some pointless optimization, I replaced this routine with a C implementation: I opted for the straightforward [Ruby Inline](https://github.com/seattlerb/rubyinline) library to call the C code straight from Ruby.
|
|
107
|
+
|
|
108
|
+
Once you have downloaded the source code, run the bench specs with:
|
|
109
|
+
|
|
139
110
|
```shell
|
|
140
|
-
|
|
111
|
+
bundle exec rake spec:bench
|
|
141
112
|
```
|
|
142
|
-
|
|
143
|
-
### Results
|
|
144
|
-
| Source | Throughput (req/s) |
|
|
145
|
-
| --------------------------: | -----------------: |
|
|
146
|
-
| aqua,parfum,zeolite | 20296.75 |
|
|
147
|
-
| agua,porfum,zeolithe | 1098.45 |
|
|
148
|
-
| agua/water,porfum/fragrance | 1599.47 |
|
data/bin/console
CHANGED
data/bin/inci_score
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env ruby
|
|
2
|
-
lib = File.expand_path(
|
|
2
|
+
lib = File.expand_path('../../lib', __FILE__)
|
|
3
3
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
4
4
|
|
|
5
|
-
require
|
|
5
|
+
require 'inci_score'
|
|
6
6
|
|
|
7
7
|
InciScore::CLI.new(args: ARGV.clone).call
|
data/lib/inci_score/cli.rb
CHANGED
|
@@ -1,38 +1,37 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
require
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'optparse'
|
|
4
|
+
require 'inci_score/computer'
|
|
4
5
|
|
|
5
6
|
module InciScore
|
|
6
7
|
class CLI
|
|
7
|
-
|
|
8
|
+
attr_reader :args, :io, :catalog
|
|
9
|
+
attr_accessor :src
|
|
10
|
+
|
|
11
|
+
def initialize(args:, io: STDOUT, catalog: Config::CATALOG)
|
|
8
12
|
@args = args
|
|
9
13
|
@io = io
|
|
10
14
|
@catalog = catalog
|
|
11
15
|
@src = nil
|
|
12
|
-
@port = nil
|
|
13
16
|
end
|
|
14
17
|
|
|
15
|
-
def call
|
|
16
|
-
parser.parse!(
|
|
17
|
-
return
|
|
18
|
-
|
|
19
|
-
|
|
18
|
+
def call
|
|
19
|
+
parser.parse!(args)
|
|
20
|
+
return io.puts(%q{Specify inci list as: --src='aqua, parfum, etc'}) unless src
|
|
21
|
+
computer = Computer.new(src: src, catalog: catalog)
|
|
22
|
+
io.puts computer.call
|
|
20
23
|
end
|
|
21
24
|
|
|
22
25
|
private def parser
|
|
23
26
|
OptionParser.new do |opts|
|
|
24
|
-
opts.banner = %q{Usage: inci_score --src=
|
|
25
|
-
|
|
26
|
-
opts.on("-sSRC", "--src=SRC", %q{The INCI list: "aqua, parfum, etc"}) do |src|
|
|
27
|
-
@src = src
|
|
28
|
-
end
|
|
27
|
+
opts.banner = %q{Usage: inci_score --src='aqua, parfum, etc'}
|
|
29
28
|
|
|
30
|
-
opts.on(
|
|
31
|
-
|
|
29
|
+
opts.on('-sSRC', '--src=SRC', %q{The INCI list: 'aqua, parfum, etc'}) do |src|
|
|
30
|
+
self.src = src
|
|
32
31
|
end
|
|
33
32
|
|
|
34
|
-
opts.on(
|
|
35
|
-
|
|
33
|
+
opts.on('-h', '--help', 'Prints this help') do
|
|
34
|
+
io.puts opts
|
|
36
35
|
exit
|
|
37
36
|
end
|
|
38
37
|
end
|
data/lib/inci_score/computer.rb
CHANGED
|
@@ -1,50 +1,50 @@
|
|
|
1
|
-
|
|
2
|
-
require "inci_score/normalizer"
|
|
3
|
-
require "inci_score/recognizer"
|
|
4
|
-
require "inci_score/response"
|
|
5
|
-
require "inci_score/scorer"
|
|
1
|
+
# frozen_string_literal: true
|
|
6
2
|
|
|
7
3
|
module InciScore
|
|
8
4
|
class Computer
|
|
9
5
|
TOLERANCE = 30.0
|
|
10
|
-
|
|
6
|
+
DECIMALS = 2
|
|
11
7
|
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
8
|
+
attr_reader :src, :catalog, :rules, :ingredients, :components, :unrecognized
|
|
9
|
+
|
|
10
|
+
def initialize(src:, catalog: Config::CATALOG, rules: Normalizer::DEFAULT_RULES)
|
|
11
|
+
@unrecognized = []
|
|
16
12
|
@src = src
|
|
17
13
|
@catalog = catalog
|
|
18
|
-
@tolerance = Float(tolerance)
|
|
19
14
|
@rules = rules
|
|
20
|
-
@
|
|
15
|
+
@ingredients = Normalizer.new(src: src, rules: rules).call
|
|
16
|
+
@components = fetch_components
|
|
17
|
+
freeze
|
|
21
18
|
end
|
|
22
19
|
|
|
23
20
|
def call
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
21
|
+
Response.new(components: components.map(&:name),
|
|
22
|
+
unrecognized: unrecognized,
|
|
23
|
+
score: score,
|
|
24
|
+
valid: valid?,
|
|
25
|
+
precision: precision)
|
|
28
26
|
end
|
|
29
27
|
|
|
30
|
-
|
|
31
|
-
Scorer.new(components.map(&:hazard)).call
|
|
28
|
+
def score
|
|
29
|
+
Scorer.new(components.map(&:hazard)).call.round(DECIMALS)
|
|
32
30
|
end
|
|
33
31
|
|
|
34
|
-
|
|
35
|
-
|
|
32
|
+
def precision
|
|
33
|
+
(100 - ((unrecognized.size / Float(ingredients.size)) * 100)).round(DECIMALS)
|
|
36
34
|
end
|
|
37
35
|
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
Recognizer.new(ingredient, @catalog).call.tap do |component|
|
|
41
|
-
@unrecognized << ingredient unless component
|
|
42
|
-
end
|
|
43
|
-
end.compact
|
|
36
|
+
def valid?
|
|
37
|
+
precision >= TOLERANCE
|
|
44
38
|
end
|
|
45
39
|
|
|
46
|
-
private
|
|
47
|
-
|
|
40
|
+
private
|
|
41
|
+
|
|
42
|
+
def fetch_components
|
|
43
|
+
ingredients.map do |ingredient|
|
|
44
|
+
Recognizer.new(ingredient, catalog).call.tap do |component|
|
|
45
|
+
unrecognized << ingredient unless component
|
|
46
|
+
end
|
|
47
|
+
end.compact
|
|
48
48
|
end
|
|
49
49
|
end
|
|
50
50
|
end
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'yaml'
|
|
4
|
+
|
|
5
|
+
module InciScore
|
|
6
|
+
module Config
|
|
7
|
+
CATALOG = YAML::load_file(File::expand_path('../../../config/catalog.yml', __FILE__)).freeze
|
|
8
|
+
HAZARDS = YAML::load_file(File::expand_path('../../../config/hazards.yml', __FILE__)).freeze
|
|
9
|
+
end
|
|
10
|
+
end
|
|
@@ -1,40 +1,39 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
module InciScore
|
|
2
4
|
class Ingredient
|
|
3
|
-
SLASH =
|
|
4
|
-
SLASH_RULE = /(?<!ate)
|
|
5
|
-
PARENTHESIS = %w[( ) [ ]]
|
|
6
|
-
DETAILS_RULE = /(\(.+\)|\[.+\])
|
|
5
|
+
SLASH = '/'
|
|
6
|
+
SLASH_RULE = /(?<!ate)\//.freeze
|
|
7
|
+
PARENTHESIS = %w[( ) [ ]].freeze
|
|
8
|
+
DETAILS_RULE = /(\(.+\)|\[.+\])/.freeze
|
|
9
|
+
|
|
10
|
+
attr_reader :raw, :tokens, :values
|
|
7
11
|
|
|
8
12
|
def initialize(raw)
|
|
9
13
|
@raw = raw.to_s
|
|
10
14
|
@tokens = @raw.split(SLASH_RULE).map(&:strip)
|
|
15
|
+
@values ||= synonims.unshift(name).compact
|
|
16
|
+
freeze
|
|
11
17
|
end
|
|
12
18
|
|
|
13
19
|
def to_s
|
|
14
20
|
values.join(SLASH)
|
|
15
21
|
end
|
|
16
22
|
|
|
17
|
-
|
|
18
|
-
@values ||= synonims.unshift(name).compact
|
|
19
|
-
end
|
|
20
|
-
|
|
21
|
-
private def name
|
|
22
|
-
return @tokens.first unless parenthesis?
|
|
23
|
-
@raw.sub(DETAILS_RULE, "").strip
|
|
24
|
-
end
|
|
23
|
+
private
|
|
25
24
|
|
|
26
|
-
|
|
27
|
-
|
|
25
|
+
def name
|
|
26
|
+
return tokens.first unless parenthesis?
|
|
27
|
+
raw.sub(DETAILS_RULE, '').strip
|
|
28
28
|
end
|
|
29
29
|
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
@raw.match(DETAILS_RULE)[1].delete(PARENTHESIS.join("|"))
|
|
30
|
+
def synonims
|
|
31
|
+
tokens[1, tokens.size].to_a
|
|
33
32
|
end
|
|
34
33
|
|
|
35
|
-
|
|
34
|
+
def parenthesis?
|
|
36
35
|
PARENTHESIS.each_slice(2).any? do |pair|
|
|
37
|
-
pair.all? { |p|
|
|
36
|
+
pair.all? { |p| raw.index(p) }
|
|
38
37
|
end
|
|
39
38
|
end
|
|
40
39
|
end
|
|
@@ -1,24 +1,28 @@
|
|
|
1
|
-
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'inline'
|
|
2
4
|
|
|
3
5
|
module InciScore
|
|
4
6
|
class LevenshteinC
|
|
5
|
-
C_PROGRAM = File::expand_path(
|
|
7
|
+
C_PROGRAM = File::expand_path('../../../ext/levenshtein.c', __FILE__)
|
|
6
8
|
|
|
7
9
|
inline(:C) do |builder|
|
|
8
|
-
builder.c File::read(C_PROGRAM)
|
|
10
|
+
builder.c File::read(C_PROGRAM)
|
|
9
11
|
end
|
|
10
12
|
end
|
|
11
13
|
|
|
12
14
|
class Levenshtein
|
|
15
|
+
attr_reader :s, :t
|
|
16
|
+
|
|
13
17
|
def initialize(s, t)
|
|
14
|
-
@s = s.downcase.unpack(
|
|
15
|
-
@t = t.downcase.unpack(
|
|
18
|
+
@s = s.downcase.unpack('U*')
|
|
19
|
+
@t = t.downcase.unpack('U*')
|
|
16
20
|
end
|
|
17
21
|
|
|
18
22
|
def call
|
|
19
|
-
n, m =
|
|
23
|
+
n, m = s.length, t.length
|
|
20
24
|
|
|
21
|
-
return 0 if
|
|
25
|
+
return 0 if s == t
|
|
22
26
|
return m if n.zero?
|
|
23
27
|
return n if m.zero?
|
|
24
28
|
|
|
@@ -28,7 +32,7 @@ module InciScore
|
|
|
28
32
|
n.times do |i|
|
|
29
33
|
e = i + 1
|
|
30
34
|
m.times do |j|
|
|
31
|
-
c =
|
|
35
|
+
c = s[i] == t[j] ? 0 : 1
|
|
32
36
|
ins = d[j + 1] + 1
|
|
33
37
|
del = e + 1
|
|
34
38
|
sub = d[j] + c
|
|
@@ -43,4 +47,3 @@ module InciScore
|
|
|
43
47
|
end
|
|
44
48
|
end
|
|
45
49
|
end
|
|
46
|
-
|
|
@@ -1,20 +1,22 @@
|
|
|
1
|
-
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'inci_score/normalizer_rules'
|
|
2
4
|
|
|
3
5
|
module InciScore
|
|
4
6
|
class Normalizer
|
|
5
|
-
DEFAULT_RULES = [Rules::Replacer, Rules::Downcaser, Rules::Beheader, Rules::Separator, Rules::Tokenizer, Rules::Sanitizer, Rules::Uniquifier]
|
|
7
|
+
DEFAULT_RULES = [Rules::Replacer, Rules::Downcaser, Rules::Beheader, Rules::Separator, Rules::Tokenizer, Rules::Sanitizer, Rules::Uniquifier].freeze
|
|
6
8
|
|
|
7
|
-
attr_reader :src
|
|
9
|
+
attr_reader :src, :rules
|
|
8
10
|
|
|
9
11
|
def initialize(src:, rules: DEFAULT_RULES)
|
|
10
12
|
@src = src
|
|
11
13
|
@rules = rules
|
|
14
|
+
freeze
|
|
12
15
|
end
|
|
13
16
|
|
|
14
17
|
def call
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
@src = rule.call(src)
|
|
18
|
+
rules.reduce(src) do |_src, rule|
|
|
19
|
+
_src = rule.call(_src)
|
|
18
20
|
end
|
|
19
21
|
end
|
|
20
22
|
end
|
|
@@ -1,8 +1,16 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
module InciScore
|
|
2
4
|
class Normalizer
|
|
3
5
|
module Rules
|
|
4
6
|
SEPARATOR = ','
|
|
5
7
|
|
|
8
|
+
Downcaser = ->(src) { src.downcase }.freeze
|
|
9
|
+
|
|
10
|
+
Tokenizer = ->(src) { src.split(SEPARATOR).map(&:strip) }.freeze
|
|
11
|
+
|
|
12
|
+
Uniquifier = ->(src) { Array(src).uniq }.freeze
|
|
13
|
+
|
|
6
14
|
module Replacer
|
|
7
15
|
extend self
|
|
8
16
|
|
|
@@ -14,7 +22,7 @@ module InciScore
|
|
|
14
22
|
['~', '-'],
|
|
15
23
|
['|', 'l'],
|
|
16
24
|
[' I ', '/']
|
|
17
|
-
]
|
|
25
|
+
].freeze
|
|
18
26
|
|
|
19
27
|
def call(src)
|
|
20
28
|
REPLACEMENTS.reduce(src) do |_src, replacement|
|
|
@@ -24,14 +32,6 @@ module InciScore
|
|
|
24
32
|
end
|
|
25
33
|
end
|
|
26
34
|
|
|
27
|
-
module Downcaser
|
|
28
|
-
extend self
|
|
29
|
-
|
|
30
|
-
def call(src)
|
|
31
|
-
src.downcase
|
|
32
|
-
end
|
|
33
|
-
end
|
|
34
|
-
|
|
35
35
|
module Beheader
|
|
36
36
|
extend self
|
|
37
37
|
|
|
@@ -40,7 +40,8 @@ module InciScore
|
|
|
40
40
|
|
|
41
41
|
def call(src)
|
|
42
42
|
sep_index = src.index(TITLE_SEP)
|
|
43
|
-
return src
|
|
43
|
+
return src unless sep_index
|
|
44
|
+
return src if sep_index > MAX_INDEX
|
|
44
45
|
src[sep_index+1, src.size]
|
|
45
46
|
end
|
|
46
47
|
end
|
|
@@ -48,27 +49,19 @@ module InciScore
|
|
|
48
49
|
module Separator
|
|
49
50
|
extend self
|
|
50
51
|
|
|
51
|
-
SEPARATORS = [
|
|
52
|
+
SEPARATORS = ['; ', '. ', " ' ", ' - ', ' : '].freeze
|
|
52
53
|
|
|
53
54
|
def call(src)
|
|
54
55
|
SEPARATORS.reduce(src) do |_src, separator|
|
|
55
56
|
_src = _src.gsub(separator, SEPARATOR)
|
|
56
57
|
end
|
|
57
58
|
end
|
|
58
|
-
end
|
|
59
|
-
|
|
60
|
-
module Tokenizer
|
|
61
|
-
extend self
|
|
62
|
-
|
|
63
|
-
def call(src)
|
|
64
|
-
src.split(SEPARATOR).map(&:strip)
|
|
65
|
-
end
|
|
66
59
|
end
|
|
67
60
|
|
|
68
61
|
module Sanitizer
|
|
69
62
|
extend self
|
|
70
63
|
|
|
71
|
-
INVALID_CHARS = /[^\/\[\]\(\)\w\s-]
|
|
64
|
+
INVALID_CHARS = /[^\/\[\]\(\)\w\s-]/.freeze
|
|
72
65
|
|
|
73
66
|
def call(src)
|
|
74
67
|
Array(src).map do |token|
|
|
@@ -76,14 +69,6 @@ module InciScore
|
|
|
76
69
|
end.reject(&:empty?)
|
|
77
70
|
end
|
|
78
71
|
end
|
|
79
|
-
|
|
80
|
-
module Uniquifier
|
|
81
|
-
extend self
|
|
82
|
-
|
|
83
|
-
def call(src)
|
|
84
|
-
Array(src).uniq
|
|
85
|
-
end
|
|
86
|
-
end
|
|
87
72
|
end
|
|
88
73
|
end
|
|
89
74
|
end
|
|
@@ -1,38 +1,43 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
require "inci_score/recognizer_rules"
|
|
2
4
|
|
|
3
5
|
module InciScore
|
|
4
6
|
class Recognizer
|
|
5
|
-
DEFAULT_RULES = [Rules::Key, Rules::Levenshtein, Rules::Digits, Rules::Tokens]
|
|
7
|
+
DEFAULT_RULES = [Rules::Key, Rules::Levenshtein, Rules::Digits, Rules::Hazard, Rules::Tokens].freeze
|
|
6
8
|
|
|
7
9
|
Component = Struct.new(:name, :hazard)
|
|
8
10
|
|
|
9
|
-
attr_reader :applied
|
|
11
|
+
attr_reader :ingredient, :catalog, :rules, :applied
|
|
10
12
|
|
|
11
13
|
def initialize(ingredient, catalog, rules = DEFAULT_RULES, wrapper = Ingredient)
|
|
12
14
|
@ingredient = wrapper.new(ingredient)
|
|
13
15
|
@catalog = catalog
|
|
14
16
|
@rules = rules
|
|
15
17
|
@applied = []
|
|
18
|
+
freeze
|
|
16
19
|
end
|
|
17
20
|
|
|
18
21
|
def call
|
|
19
|
-
return if
|
|
22
|
+
return if ingredient.to_s.empty?
|
|
20
23
|
component = find_component
|
|
21
24
|
return unless component
|
|
22
|
-
Component.new(component,
|
|
23
|
-
end
|
|
25
|
+
Component.new(component, catalog[component])
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
private
|
|
24
29
|
|
|
25
|
-
|
|
26
|
-
|
|
30
|
+
def find_component
|
|
31
|
+
rules.reduce(nil) do |component, rule|
|
|
27
32
|
break(component) if component
|
|
28
33
|
applied << rule
|
|
29
34
|
apply(rule)
|
|
30
35
|
end
|
|
31
36
|
end
|
|
32
37
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
rule.call(value,
|
|
38
|
+
def apply(rule)
|
|
39
|
+
ingredient.values.map do |value|
|
|
40
|
+
rule.call(value, catalog)
|
|
36
41
|
end.find(&:itself)
|
|
37
42
|
end
|
|
38
43
|
end
|
|
@@ -1,4 +1,6 @@
|
|
|
1
|
-
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'inci_score/refinements'
|
|
2
4
|
|
|
3
5
|
module InciScore
|
|
4
6
|
using Refinements
|
|
@@ -6,13 +8,9 @@ module InciScore
|
|
|
6
8
|
module Rules
|
|
7
9
|
TOLERANCE = 3
|
|
8
10
|
|
|
9
|
-
|
|
10
|
-
extend self
|
|
11
|
+
Key = ->(src, catalog) { src if catalog.has_key?(src) }
|
|
11
12
|
|
|
12
|
-
|
|
13
|
-
src if catalog.has_key?(src)
|
|
14
|
-
end
|
|
15
|
-
end
|
|
13
|
+
Hazard = ->(src, _) { 'generic-hazard' if Config::HAZARDS.any? { |h| src.include?(h) } }
|
|
16
14
|
|
|
17
15
|
module Levenshtein
|
|
18
16
|
extend self
|
|
@@ -54,19 +52,21 @@ module InciScore
|
|
|
54
52
|
module Tokens
|
|
55
53
|
extend self
|
|
56
54
|
|
|
57
|
-
UNMATCHABLE = %w[extract oil sodium acid sulfate]
|
|
58
|
-
|
|
55
|
+
UNMATCHABLE = %w[extract oil sodium acid sulfate].freeze
|
|
56
|
+
|
|
59
57
|
def call(src, catalog)
|
|
60
58
|
tokens(src).each do |token|
|
|
61
|
-
catalog.each do |component, _|
|
|
59
|
+
catalog.each do |component, _|
|
|
62
60
|
return component if component.include?(token)
|
|
63
61
|
end
|
|
64
62
|
end
|
|
65
63
|
nil
|
|
66
64
|
end
|
|
67
65
|
|
|
66
|
+
private
|
|
67
|
+
|
|
68
68
|
def tokens(src)
|
|
69
|
-
(src.split(
|
|
69
|
+
(src.split(' ') - UNMATCHABLE).reject { |t| t.size < TOLERANCE }.sort! { |a, b| b.size <=> a.size }
|
|
70
70
|
end
|
|
71
71
|
end
|
|
72
72
|
end
|