inci_score 2.5.1 → 3.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +34 -0
- data/README.md +14 -25
- data/config/catalog.yml +3 -4
- data/config.ru +2 -2
- data/inci_score.gemspec +2 -3
- data/lib/inci_score/{app.rb → api.rb} +1 -1
- data/lib/inci_score/cli.rb +4 -14
- data/lib/inci_score/computer.rb +15 -6
- data/lib/inci_score/ingredient.rb +45 -0
- data/lib/inci_score/normalizer.rb +1 -1
- data/lib/inci_score/normalizer_rules.rb +0 -24
- data/lib/inci_score/recognizer.rb +25 -7
- data/lib/inci_score/recognizer_rules.rb +19 -22
- data/lib/inci_score/refinements.rb +1 -1
- data/lib/inci_score/response.rb +1 -1
- data/lib/inci_score/score.rb +1 -3
- data/lib/inci_score/scorer.rb +7 -9
- data/lib/inci_score/version.rb +1 -1
- data/lib/inci_score.rb +4 -5
- data/spec/bench/levenshtein_bench.rb +17 -0
- data/spec/bench/normalizer_rules_bench.rb +40 -0
- data/spec/bench/recognizer_rules_bench.rb +24 -0
- data/spec/helper.rb +6 -0
- data/spec/integration/api_spec.rb +23 -0
- data/spec/stubs.rb +170 -0
- data/spec/unit/catalog_spec.rb +7 -0
- data/spec/unit/cli_spec.rb +29 -0
- data/spec/unit/computer_spec.rb +31 -0
- data/spec/unit/ingredient_spec.rb +34 -0
- data/spec/unit/levenshtein_spec.rb +19 -0
- data/spec/unit/normalizer_rules_spec.rb +58 -0
- data/spec/unit/normalizer_spec.rb +31 -0
- data/spec/unit/recognizer_rules_spec.rb +46 -0
- data/spec/unit/recognizer_spec.rb +49 -0
- data/spec/unit/response_spec.rb +8 -0
- data/spec/unit/score_spec.rb +12 -0
- data/spec/unit/scorer_spec.rb +11 -0
- data/spec/unit/server_spec.rb +30 -0
- metadata +24 -23
- data/.gitignore +0 -13
- data/.travis.yml +0 -6
- data/bin/console +0 -7
- data/bin/setup +0 -6
- data/lib/inci_score/fetcher.rb +0 -41
- data/log/.gitignore +0 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3acb19f9ac522f6a50378ec318c6466ae9b67257
|
4
|
+
data.tar.gz: 3d46458825c9d1fb3ff74db0779b329b63c5f97a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9ab029ca732a8605502a7263fa3d7fc96ad20be2ca6aaf9b142e599746eae8216bd326ae318b92aed19a24e157456fe63dd99642d52f536c0a3ea0a4cd59e5bd
|
7
|
+
data.tar.gz: 6d23d330b4267abc0d2ba4241c52d0f55d94922bc1dbb776842e3266afd699f366ade1c648f2e76c3ba2a85db7a680aac59257f683d9d85b5fb2b9b352eae536
|
data/Gemfile.lock
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
inci_score (3.0.1)
|
5
|
+
RubyInline (~> 3)
|
6
|
+
puma (~> 3)
|
7
|
+
|
8
|
+
GEM
|
9
|
+
remote: https://rubygems.org/
|
10
|
+
specs:
|
11
|
+
RubyInline (3.12.4)
|
12
|
+
ZenTest (~> 4.3)
|
13
|
+
ZenTest (4.11.1)
|
14
|
+
benchmark-ips (2.7.2)
|
15
|
+
minitest (5.10.3)
|
16
|
+
puma (3.10.0)
|
17
|
+
rack (2.0.3)
|
18
|
+
rack-test (0.7.0)
|
19
|
+
rack (>= 1.0, < 3)
|
20
|
+
rake (10.5.0)
|
21
|
+
|
22
|
+
PLATFORMS
|
23
|
+
ruby
|
24
|
+
|
25
|
+
DEPENDENCIES
|
26
|
+
benchmark-ips (~> 2)
|
27
|
+
bundler (~> 1.11)
|
28
|
+
inci_score!
|
29
|
+
minitest (~> 5.0)
|
30
|
+
rack-test (~> 0.6)
|
31
|
+
rake (~> 10.0)
|
32
|
+
|
33
|
+
BUNDLED WITH
|
34
|
+
1.15.4
|
data/README.md
CHANGED
@@ -8,7 +8,6 @@
|
|
8
8
|
* [API](#api)
|
9
9
|
* [Unrecognized components](#unrecognized-components)
|
10
10
|
* [CLI](#cli)
|
11
|
-
* [Refresh catalog](#refresh-catalog)
|
12
11
|
* [HTTP server](#http-server)
|
13
12
|
* [Triggering a request](#triggering-a-request)
|
14
13
|
* [Getting help](#getting-help)
|
@@ -52,9 +51,7 @@ The API of the gem is pretty simple, you can open irb by *bundle console* and st
|
|
52
51
|
|
53
52
|
```ruby
|
54
53
|
inci = InciScore::Computer.new(src: 'aqua, dimethicone').call
|
55
|
-
|
56
|
-
inci.score
|
57
|
-
=> 53.762874945799766
|
54
|
+
inci.score # 53.7629
|
58
55
|
```
|
59
56
|
|
60
57
|
As you see the results are wrapped by an *InciScore::Response* object, this is useful when dealing with the CLI and HTTP interfaces (read below).
|
@@ -66,11 +63,8 @@ Is still possible to query the object for its state:
|
|
66
63
|
|
67
64
|
```ruby
|
68
65
|
inci = InciScore::Computer.new(src: 'ingredients:aqua,noent1,noent2').call
|
69
|
-
|
70
|
-
inci.
|
71
|
-
=> false
|
72
|
-
inci.unrecognized
|
73
|
-
=> ["noent1", "noent2"]
|
66
|
+
inci.valid # false
|
67
|
+
inci.unrecognized # ["noent1", "noent2"]
|
74
68
|
```
|
75
69
|
|
76
70
|
## CLI
|
@@ -80,7 +74,7 @@ You can collect INCI data by using the available CLI interface:
|
|
80
74
|
inci_score --src="ingredients: aqua, dimethicone, pej-10, noent"
|
81
75
|
|
82
76
|
TOTAL SCORE:
|
83
|
-
47.
|
77
|
+
47.1803
|
84
78
|
VALID STATE:
|
85
79
|
true
|
86
80
|
COMPONENTS (hazard - name):
|
@@ -91,12 +85,6 @@ UNRECOGNIZED:
|
|
91
85
|
noent
|
92
86
|
```
|
93
87
|
|
94
|
-
### Refresh catalog
|
95
|
-
You also have the option to fetch a fresh catalog from www.biodizionario.it by specifyng a flag:
|
96
|
-
```shell
|
97
|
-
inci_score --fresh --src="aqua, dimethicone"
|
98
|
-
```
|
99
|
-
|
100
88
|
### HTTP server
|
101
89
|
The CLI interface exposes a Web layer based on the [Puma](http://puma.io/) application server.
|
102
90
|
The HTTP server is started on the specified port by spawning as many workers as your current workstation supports:
|
@@ -111,17 +99,16 @@ You can pass the source string directly as a HTTP parameter (URI escaped):
|
|
111
99
|
|
112
100
|
```shell
|
113
101
|
curl http://127.0.0.1:9292?src=aqua,dimethicone
|
114
|
-
=> {"components":{"aqua":0,"dimethicone":4},"unrecognized":[],"score":53.
|
102
|
+
=> {"components":{"aqua":0,"dimethicone":4},"unrecognized":[],"score":53.7629,"valid":true}
|
115
103
|
```
|
116
104
|
|
117
105
|
### Getting help
|
118
106
|
You can get CLI interface help by:
|
119
107
|
```shell
|
120
|
-
Usage: inci_score --src="aqua, parfum, etc" --
|
108
|
+
Usage: inci_score --src="aqua, parfum, etc" --precise
|
121
109
|
-s, --src=SRC The INCI list: "aqua, parfum, etc"
|
122
|
-
-f, --fresh Fetch a fresh catalog from remote
|
123
110
|
-p, --precise Compute components more precisely (slower)
|
124
|
-
--http=PORT Start
|
111
|
+
--http=PORT Start HTTP server on the specified port
|
125
112
|
-h, --help Prints this help
|
126
113
|
```
|
127
114
|
|
@@ -140,13 +127,15 @@ I registered these benchmarks with a MacBook PRO 15 mid 2015 having these specs:
|
|
140
127
|
* Ruby 2.4
|
141
128
|
|
142
129
|
### Wrk
|
143
|
-
As always i used [wrk](https://github.com/wg/wrk) as the loading tool
|
130
|
+
As always i used [wrk](https://github.com/wg/wrk) as the loading tool.
|
144
131
|
I measured the library three times, picking the best lap.
|
145
132
|
```shell
|
146
|
-
wrk -t 4 -c 100 -d 30s --timeout 2000 http://0.0.0.0:9292/?src=
|
133
|
+
wrk -t 4 -c 100 -d 30s --timeout 2000 "http://0.0.0.0:9292/?src=<source>&precise=true"
|
147
134
|
```
|
148
135
|
|
149
136
|
### Results
|
150
|
-
| Throughput (req/s) |
|
151
|
-
|
|
152
|
-
|
|
137
|
+
| Source | Throughput (req/s) |
|
138
|
+
| --------------------------: | -----------------: |
|
139
|
+
| aqua,parfum,zeolite | 18784.21 |
|
140
|
+
| agua,porfum,zeolithe | 1087.88 |
|
141
|
+
| agua/water,porfum/fragrance | 1599.47 |
|
data/config/catalog.yml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
parfum:
|
4
|
-
fragrance:
|
2
|
+
aqua: 0
|
3
|
+
parfum: 3
|
4
|
+
fragrance: 3
|
5
5
|
phosphatidylcholine: 1
|
6
6
|
1-naphthol: 4
|
7
7
|
1,2,4-benzenetriacetate: 4
|
@@ -296,7 +296,6 @@ apricot kernel oil peg-6 esters: 3
|
|
296
296
|
apricotamide dea: 3
|
297
297
|
apricotamidopropyl betaine: 2
|
298
298
|
apricotamidopropyl ethyldimonium ethosulfate: 2
|
299
|
-
aqua: 0
|
300
299
|
arachideth-20: 3
|
301
300
|
arachidic acid: 1
|
302
301
|
arachidonic acid: 1
|
data/config.ru
CHANGED
@@ -1,3 +1,3 @@
|
|
1
|
-
require
|
1
|
+
require "inci_score/api"
|
2
2
|
|
3
|
-
run InciScore::
|
3
|
+
run InciScore::Api
|
data/inci_score.gemspec
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
lib = File.expand_path('../lib', __FILE__)
|
2
2
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
3
|
-
require
|
3
|
+
require "inci_score/version"
|
4
4
|
|
5
5
|
Gem::Specification.new do |s|
|
6
6
|
s.name = "inci_score"
|
@@ -9,14 +9,13 @@ Gem::Specification.new do |s|
|
|
9
9
|
s.email = ["costajob@gmail.com"]
|
10
10
|
s.summary = %q{A library that computes the hazard of cosmetic products components, based on the Biodizionario data.}
|
11
11
|
s.homepage = "https://github.com/costajob/inci_score.git"
|
12
|
-
s.files =
|
12
|
+
s.files = %w(README.md Rakefile inci_score.gemspec Gemfile Gemfile.lock config.ru config/catalog.yml ext/levenshtein.c bin/inci_score) + Dir["{spec,lib}/**/*.rb"]
|
13
13
|
s.bindir = "bin"
|
14
14
|
s.executables << "inci_score"
|
15
15
|
s.require_paths = ["lib"]
|
16
16
|
s.license = "MIT"
|
17
17
|
s.required_ruby_version = ">= 2.2.2"
|
18
18
|
|
19
|
-
s.add_runtime_dependency "nokogiri", "~> 1.6"
|
20
19
|
s.add_runtime_dependency "puma", "~> 3"
|
21
20
|
s.add_runtime_dependency "RubyInline", "~> 3"
|
22
21
|
|
data/lib/inci_score/cli.rb
CHANGED
@@ -9,35 +9,30 @@ module InciScore
|
|
9
9
|
@io = io
|
10
10
|
@catalog = catalog
|
11
11
|
@src = nil
|
12
|
-
@fresh = nil
|
13
12
|
@port = nil
|
14
13
|
@precise = nil
|
15
14
|
end
|
16
15
|
|
17
|
-
def call(server_klass: Server, computer_klass: Computer
|
16
|
+
def call(server_klass: Server, computer_klass: Computer)
|
18
17
|
parser.parse!(@args)
|
19
18
|
return server_klass.new(port: @port, preload: true).run if @port
|
20
19
|
return @io.puts(%q{Specify inci list as: --src="aqua, parfum, etc"}) unless @src
|
21
|
-
@io.puts computer_klass.new(src: @src, catalog: catalog
|
20
|
+
@io.puts computer_klass.new(src: @src, catalog: @catalog, precise: @precise).call
|
22
21
|
end
|
23
22
|
|
24
23
|
private def parser
|
25
24
|
OptionParser.new do |opts|
|
26
|
-
opts.banner = %q{Usage: inci_score --src="aqua, parfum, etc" --
|
25
|
+
opts.banner = %q{Usage: inci_score --src="aqua, parfum, etc" --precise}
|
27
26
|
|
28
27
|
opts.on("-sSRC", "--src=SRC", %q{The INCI list: "aqua, parfum, etc"}) do |src|
|
29
28
|
@src = src
|
30
29
|
end
|
31
30
|
|
32
|
-
opts.on("-f", "--fresh", "Fetch a fresh catalog from remote") do |fresh|
|
33
|
-
@fresh = fresh
|
34
|
-
end
|
35
|
-
|
36
31
|
opts.on("-p", "--precise", "Compute components more precisely (slower)") do |precise|
|
37
32
|
@precise = precise
|
38
33
|
end
|
39
34
|
|
40
|
-
opts.on("--http=PORT", "Start
|
35
|
+
opts.on("--http=PORT", "Start HTTP server on the specified port") do |port|
|
41
36
|
@port = port
|
42
37
|
end
|
43
38
|
|
@@ -47,10 +42,5 @@ module InciScore
|
|
47
42
|
end
|
48
43
|
end
|
49
44
|
end
|
50
|
-
|
51
|
-
private def catalog(fetcher)
|
52
|
-
return @catalog unless @fresh
|
53
|
-
fetcher.call
|
54
|
-
end
|
55
45
|
end
|
56
46
|
end
|
data/lib/inci_score/computer.rb
CHANGED
@@ -1,13 +1,19 @@
|
|
1
|
+
require "inci_score/ingredient"
|
1
2
|
require "inci_score/normalizer"
|
2
3
|
require "inci_score/recognizer"
|
3
|
-
require "inci_score/scorer"
|
4
4
|
require "inci_score/response"
|
5
|
+
require "inci_score/scorer"
|
5
6
|
|
6
7
|
module InciScore
|
7
8
|
class Computer
|
8
9
|
TOLERANCE = 30.0
|
10
|
+
PERCENT = 100.0
|
9
11
|
|
10
|
-
def initialize(src:,
|
12
|
+
def initialize(src:,
|
13
|
+
catalog: Catalog.fetch,
|
14
|
+
tolerance: TOLERANCE,
|
15
|
+
rules: Normalizer::DEFAULT_RULES,
|
16
|
+
precise: false)
|
11
17
|
@src = src
|
12
18
|
@catalog = catalog
|
13
19
|
@tolerance = Float(tolerance)
|
@@ -17,18 +23,21 @@ module InciScore
|
|
17
23
|
end
|
18
24
|
|
19
25
|
def call
|
20
|
-
@response ||= Response.new(components: components.map(&:
|
26
|
+
@response ||= Response.new(components: components.map(&:name),
|
21
27
|
unrecognized: @unrecognized,
|
22
28
|
score: score,
|
23
29
|
valid: valid?)
|
24
30
|
end
|
25
31
|
|
26
32
|
private def score
|
27
|
-
Scorer.new(components.map(&:
|
33
|
+
Scorer.new(components.map(&:hazard)).call
|
28
34
|
end
|
29
35
|
|
30
36
|
private def ingredients
|
31
|
-
@ingredients ||=
|
37
|
+
@ingredients ||= begin
|
38
|
+
tokens = Normalizer.new(src: @src, rules: @rules).call
|
39
|
+
Ingredient.bulk(tokens)
|
40
|
+
end
|
32
41
|
end
|
33
42
|
|
34
43
|
private def components
|
@@ -40,7 +49,7 @@ module InciScore
|
|
40
49
|
end
|
41
50
|
|
42
51
|
private def valid?
|
43
|
-
@unrecognized.size / (ingredients.size /
|
52
|
+
@unrecognized.size / (ingredients.size / PERCENT) <= @tolerance
|
44
53
|
end
|
45
54
|
end
|
46
55
|
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
module InciScore
|
2
|
+
class Ingredient
|
3
|
+
SLASH = "/"
|
4
|
+
SLASH_RULE = /(?<!ate)\//
|
5
|
+
PARENTHESIS = %w[( ) [ ]]
|
6
|
+
DETAILS_RULE = /(\(.+\)|\[.+\])/
|
7
|
+
|
8
|
+
def self.bulk(tokens)
|
9
|
+
tokens.map { |raw| new(raw) }
|
10
|
+
end
|
11
|
+
|
12
|
+
def initialize(raw)
|
13
|
+
@raw = raw
|
14
|
+
@tokens = raw.split(SLASH_RULE).map(&:strip)
|
15
|
+
end
|
16
|
+
|
17
|
+
def to_s
|
18
|
+
values.join(SLASH)
|
19
|
+
end
|
20
|
+
|
21
|
+
def values
|
22
|
+
@values ||= synonims.unshift(name).compact
|
23
|
+
end
|
24
|
+
|
25
|
+
private def name
|
26
|
+
return @tokens.first unless parenthesis?
|
27
|
+
@raw.sub(DETAILS_RULE, "").strip
|
28
|
+
end
|
29
|
+
|
30
|
+
private def synonims
|
31
|
+
@tokens[1, @tokens.size]
|
32
|
+
end
|
33
|
+
|
34
|
+
private def details
|
35
|
+
return unless parenthesis?
|
36
|
+
@raw.match(DETAILS_RULE)[1].delete(PARENTHESIS.join("|"))
|
37
|
+
end
|
38
|
+
|
39
|
+
private def parenthesis?
|
40
|
+
PARENTHESIS.each_slice(2).any? do |pair|
|
41
|
+
pair.all? { |p| @raw.index(p) }
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
@@ -2,7 +2,7 @@ require "inci_score/normalizer_rules"
|
|
2
2
|
|
3
3
|
module InciScore
|
4
4
|
class Normalizer
|
5
|
-
DEFAULT_RULES = [Rules::Replacer, Rules::Downcaser, Rules::Beheader, Rules::Separator, Rules::Tokenizer, Rules::Sanitizer, Rules::
|
5
|
+
DEFAULT_RULES = [Rules::Replacer, Rules::Downcaser, Rules::Beheader, Rules::Separator, Rules::Tokenizer, Rules::Sanitizer, Rules::Uniquifier]
|
6
6
|
|
7
7
|
attr_reader :src
|
8
8
|
|
@@ -77,30 +77,6 @@ module InciScore
|
|
77
77
|
end
|
78
78
|
end
|
79
79
|
|
80
|
-
module Desynonymizer
|
81
|
-
extend self
|
82
|
-
|
83
|
-
SYNONYM = /\/.*/
|
84
|
-
|
85
|
-
def call(src)
|
86
|
-
Array(src).map do |token|
|
87
|
-
token.sub(SYNONYM, '').strip
|
88
|
-
end.reject(&:empty?)
|
89
|
-
end
|
90
|
-
end
|
91
|
-
|
92
|
-
module Deparenthesizer
|
93
|
-
extend self
|
94
|
-
|
95
|
-
PARENTHESIS = /\(.+?\)|\[.+?\]/
|
96
|
-
|
97
|
-
def call(src)
|
98
|
-
Array(src).map do |token|
|
99
|
-
token.sub(PARENTHESIS, '').strip
|
100
|
-
end.reject(&:empty?)
|
101
|
-
end
|
102
|
-
end
|
103
|
-
|
104
80
|
module Uniquifier
|
105
81
|
extend self
|
106
82
|
|
@@ -4,19 +4,37 @@ module InciScore
|
|
4
4
|
class Recognizer
|
5
5
|
DEFAULT_RULES = [Rules::Key, Rules::Levenshtein, Rules::Digits, Rules::Tokens]
|
6
6
|
|
7
|
-
|
8
|
-
|
7
|
+
Component = Struct.new(:name, :hazard)
|
8
|
+
|
9
|
+
attr_reader :applied
|
10
|
+
|
11
|
+
def initialize(ingredient, catalog, rules = DEFAULT_RULES)
|
12
|
+
@ingredient = ingredient
|
9
13
|
@catalog = catalog
|
10
14
|
@rules = rules
|
15
|
+
@applied = []
|
11
16
|
end
|
12
17
|
|
13
18
|
def call(precise = false)
|
14
|
-
|
19
|
+
return if @ingredient.to_s.empty?
|
20
|
+
component = find_component(precise)
|
21
|
+
return unless component
|
22
|
+
Component.new(component, @catalog[component])
|
23
|
+
end
|
24
|
+
|
25
|
+
private def find_component(precise)
|
26
|
+
@rules.reduce(nil) do |component, rule|
|
15
27
|
break(component) if component
|
16
|
-
|
17
|
-
rule
|
28
|
+
applied << rule
|
29
|
+
apply(rule, precise)
|
18
30
|
end
|
19
|
-
|
20
|
-
|
31
|
+
end
|
32
|
+
|
33
|
+
private def apply(rule, precise)
|
34
|
+
return rule.call(@ingredient.to_s, @catalog) unless precise
|
35
|
+
@ingredient.values.map do |value|
|
36
|
+
rule.call(value, @catalog)
|
37
|
+
end.find(&:itself)
|
38
|
+
end
|
21
39
|
end
|
22
40
|
end
|
@@ -9,7 +9,7 @@ module InciScore
|
|
9
9
|
module Key
|
10
10
|
extend self
|
11
11
|
|
12
|
-
def call(src, catalog
|
12
|
+
def call(src, catalog)
|
13
13
|
src if catalog.has_key?(src)
|
14
14
|
end
|
15
15
|
end
|
@@ -17,20 +17,24 @@ module InciScore
|
|
17
17
|
module Levenshtein
|
18
18
|
extend self
|
19
19
|
|
20
|
-
|
20
|
+
Result = Struct.new(:name, :distance) do
|
21
|
+
def tolerable?(size)
|
22
|
+
distance < TOLERANCE && distance <= (size-1)
|
23
|
+
end
|
24
|
+
end
|
21
25
|
|
22
|
-
def call(src, catalog
|
26
|
+
def call(src, catalog)
|
23
27
|
size = src.size
|
28
|
+
farthest = Result.new(nil, size)
|
24
29
|
initial = src[0]
|
25
|
-
|
26
|
-
next
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
min
|
30
|
+
result = catalog.reduce(farthest) do |nearest, (component, _)|
|
31
|
+
next nearest unless component.start_with?(initial)
|
32
|
+
next nearest if component.size > (size + TOLERANCE)
|
33
|
+
d = src.distance(component)
|
34
|
+
nearest = Result.new(component, d) if d < nearest.distance
|
35
|
+
nearest
|
32
36
|
end
|
33
|
-
|
37
|
+
result.name if result.tolerable?(size)
|
34
38
|
end
|
35
39
|
end
|
36
40
|
|
@@ -39,17 +43,10 @@ module InciScore
|
|
39
43
|
|
40
44
|
MIN_MEANINGFUL = 7
|
41
45
|
|
42
|
-
def call(src, catalog
|
46
|
+
def call(src, catalog)
|
43
47
|
return if src.size < TOLERANCE
|
44
|
-
digits = src[0,
|
45
|
-
catalog.detect
|
46
|
-
component.matches?(/^#{Regexp::escape(digits)}/)
|
47
|
-
end.to_a.first
|
48
|
-
end
|
49
|
-
|
50
|
-
def min_meaningful(precise)
|
51
|
-
return MIN_MEANINGFUL unless precise
|
52
|
-
MIN_MEANINGFUL + 2
|
48
|
+
digits = src[0, MIN_MEANINGFUL]
|
49
|
+
catalog.detect { |component, _| component.start_with?(digits) }.to_a.first
|
53
50
|
end
|
54
51
|
end
|
55
52
|
|
@@ -58,7 +55,7 @@ module InciScore
|
|
58
55
|
|
59
56
|
UNMATCHABLE = %w[extract oil sodium acid sulfate]
|
60
57
|
|
61
|
-
def call(src, catalog
|
58
|
+
def call(src, catalog)
|
62
59
|
tokens(src).each do |token|
|
63
60
|
catalog.each do |component, _|
|
64
61
|
return component if component.matches?(/\b#{Regexp.escape(token)}\b/)
|
data/lib/inci_score/response.rb
CHANGED
data/lib/inci_score/score.rb
CHANGED
data/lib/inci_score/scorer.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
require
|
1
|
+
require "inci_score/score"
|
2
2
|
|
3
3
|
module InciScore
|
4
4
|
class Scorer
|
@@ -12,33 +12,31 @@ module InciScore
|
|
12
12
|
|
13
13
|
def call
|
14
14
|
return 0 if @hazards.empty?
|
15
|
-
100 - avg * HAZARD_PERCENT
|
15
|
+
(100 - avg * HAZARD_PERCENT).round(4)
|
16
16
|
end
|
17
17
|
|
18
|
-
private
|
19
|
-
|
20
|
-
def avg
|
18
|
+
private def avg
|
21
19
|
avg_weighted / @size.to_f
|
22
20
|
end
|
23
21
|
|
24
|
-
def avg_weighted
|
22
|
+
private def avg_weighted
|
25
23
|
return @hazards.reduce(&:+) if same_hazard?
|
26
24
|
weighted.reduce(0.0) do |acc,score|
|
27
25
|
acc += score.value
|
28
26
|
end
|
29
27
|
end
|
30
28
|
|
31
|
-
def same_hazard?
|
29
|
+
private def same_hazard?
|
32
30
|
@hazards.uniq.size == 1
|
33
31
|
end
|
34
32
|
|
35
|
-
def weighted
|
33
|
+
private def weighted
|
36
34
|
@hazards.each_with_index.map do |h,i|
|
37
35
|
Score.new(h, weight(i))
|
38
36
|
end
|
39
37
|
end
|
40
38
|
|
41
|
-
def weight(index)
|
39
|
+
private def weight(index)
|
42
40
|
Math.log(index+1, @size * WEIGHT_FACTOR)
|
43
41
|
end
|
44
42
|
end
|
data/lib/inci_score/version.rb
CHANGED
data/lib/inci_score.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
|
-
require
|
2
|
-
require
|
3
|
-
require
|
4
|
-
require
|
5
|
-
require 'inci_score/cli'
|
1
|
+
require "inci_score/version"
|
2
|
+
require "inci_score/catalog"
|
3
|
+
require "inci_score/cli"
|
4
|
+
require "inci_score/api"
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require "helper"
|
2
|
+
|
3
|
+
s, t = "agua", "aqua"
|
4
|
+
lev_ruby = InciScore::Levenshtein.new(s, t)
|
5
|
+
lev_c = InciScore::LevenshteinC.new
|
6
|
+
|
7
|
+
Benchmark.ips do |x|
|
8
|
+
x.report("levenshtein ruby") do
|
9
|
+
lev_ruby.call
|
10
|
+
end
|
11
|
+
|
12
|
+
x.report("levenshtein C") do
|
13
|
+
lev_c.call(s, s.size, t, t.size)
|
14
|
+
end
|
15
|
+
|
16
|
+
x.compare!
|
17
|
+
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require "helper"
|
2
|
+
|
3
|
+
replacer = InciScore::Normalizer::Rules::Replacer
|
4
|
+
downcaser = InciScore::Normalizer::Rules::Downcaser
|
5
|
+
beheader = InciScore::Normalizer::Rules::Beheader
|
6
|
+
separator = InciScore::Normalizer::Rules::Separator
|
7
|
+
tokenizer = InciScore::Normalizer::Rules::Tokenizer
|
8
|
+
sanitizer = InciScore::Normalizer::Rules::Sanitizer
|
9
|
+
uniquifier = InciScore::Normalizer::Rules::Uniquifier
|
10
|
+
src = "‘INGREDIENTS‘:\n\nCOCO—BETANE,AQUA/WATER,DIMETHICONE"
|
11
|
+
|
12
|
+
Benchmark.ips do |x|
|
13
|
+
x.report("replacer") do
|
14
|
+
replacer.call(src)
|
15
|
+
end
|
16
|
+
|
17
|
+
x.report("downcaser") do
|
18
|
+
downcaser.call(src)
|
19
|
+
end
|
20
|
+
|
21
|
+
x.report("beheader") do
|
22
|
+
beheader.call(src)
|
23
|
+
end
|
24
|
+
|
25
|
+
x.report("separator") do
|
26
|
+
separator.call(src)
|
27
|
+
end
|
28
|
+
|
29
|
+
x.report("tokenizer") do
|
30
|
+
tokenizer.call(src)
|
31
|
+
end
|
32
|
+
|
33
|
+
x.report("sanitizer") do
|
34
|
+
sanitizer.call(src)
|
35
|
+
end
|
36
|
+
|
37
|
+
x.report("uniquifier") do
|
38
|
+
uniquifier.call(src)
|
39
|
+
end
|
40
|
+
end
|