inci_score 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +11 -0
- data/.travis.yml +7 -0
- data/Gemfile +4 -0
- data/README.md +117 -0
- data/Rakefile +18 -0
- data/bin/console +7 -0
- data/bin/inci_score +7 -0
- data/bin/setup +6 -0
- data/config/catalog.yml +5014 -0
- data/config.ru +3 -0
- data/ext/levenshtein.c +43 -0
- data/inci_score.gemspec +28 -0
- data/lib/inci_score/api/app.rb +21 -0
- data/lib/inci_score/catalog.rb +13 -0
- data/lib/inci_score/computer.rb +45 -0
- data/lib/inci_score/levenshtein.rb +55 -0
- data/lib/inci_score/normalizer.rb +21 -0
- data/lib/inci_score/normalizer_rules.rb +76 -0
- data/lib/inci_score/parser.rb +43 -0
- data/lib/inci_score/recognizer.rb +27 -0
- data/lib/inci_score/recognizer_rules.rb +75 -0
- data/lib/inci_score/response.rb +31 -0
- data/lib/inci_score/score.rb +19 -0
- data/lib/inci_score/scorer.rb +45 -0
- data/lib/inci_score/version.rb +3 -0
- data/lib/inci_score.rb +4 -0
- data/log/.gitignore +4 -0
- metadata +170 -0
data/config.ru
ADDED
data/ext/levenshtein.c
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
/*
 * Levenshtein edit distance between the first len1 bytes of word1 and the
 * first len2 bytes of word2 (plain byte comparison, no case folding).
 *
 * Only one row of the distance table is kept, so the stack usage is
 * len2 + 1 ints instead of (len1 + 1) * (len2 + 1); the result is the same
 * as with the full table.
 */
static int call(const char* word1, int len1, const char* word2, int len2) {
  int row[len2 + 1];
  int i;
  int j;

  /* Distance from the empty prefix of word1 to each prefix of word2. */
  for (j = 0; j <= len2; j++) {
    row[j] = j;
  }

  for (i = 1; i <= len1; i++) {
    char c1;
    int diagonal;

    c1 = word1[i-1];
    diagonal = row[0];  /* value of the previous row at column j-1 */
    row[0] = i;

    for (j = 1; j <= len2; j++) {
      int above;

      above = row[j];   /* value of the previous row at column j */
      if (c1 == word2[j-1]) {
        row[j] = diagonal;
      }
      else {
        int minimum;

        minimum = above;            /* delete */
        if (row[j-1] < minimum) {   /* insert */
          minimum = row[j-1];
        }
        if (diagonal < minimum) {   /* substitute */
          minimum = diagonal;
        }
        row[j] = minimum + 1;
      }
      diagonal = above;
    }
  }
  return row[len2];
}
|
data/inci_score.gemspec
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
# Gem specification for inci_score.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'inci_score/version'

Gem::Specification.new do |s|
  s.name = "inci_score"
  s.version = InciScore::VERSION
  s.authors = ["costajob"]
  s.email = ["costajob@gmail.com"]
  s.summary = %q{A library that computes the hazard of cosmetic products components, based on the Biodizionario data.}
  s.homepage = "https://github.com/costajob/inci_score.git"
  s.license = "MIT"
  s.required_ruby_version = ">= 2.0.0"

  s.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(spec|test|s|features)/}) }
  # The package ships its scripts under bin/ (console, inci_score, setup) and
  # has no exe/ directory, so pointing bindir at "exe" installed no executable.
  # Only the gem's own command is exposed; console and setup are left out.
  s.bindir = "bin"
  s.executables = ["inci_score"]
  s.require_paths = ["lib"]

  s.add_runtime_dependency "nokogiri", "~> 1.6"
  s.add_runtime_dependency "puma", "~> 3.0"
  s.add_runtime_dependency "RubyInline", "~> 3.0"

  s.add_development_dependency "bundler", "~> 1.11"
  s.add_development_dependency "rake", "~> 10.0"
  s.add_development_dependency "minitest", "~> 5.0"
  s.add_development_dependency "rack-test", "~> 0.6"
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'rack'
|
2
|
+
require 'inci_score'
|
3
|
+
|
4
|
+
module InciScore
  module API
    # Rack endpoint: reads the "src" request parameter, computes its score
    # and answers with a JSON document.
    module App
      extend self

      # Catalog obtained from Catalog.fetch, memoized on the module so it is
      # fetched once and reused by later calls.
      def catalog
        @catalog ||= Catalog.fetch
      end

      # Rack entry point.
      #
      # env - the Rack environment Hash.
      #
      # Returns the [status, headers, body] triple. The body is the JSON of
      # the computed response, or a JSON error object when "src" is missing.
      def call(env)
        req = Rack::Request.new(env)
        src = req.params["src"]
        json = src ? Computer.new(src, catalog).call.to_json : %q({"error": "no valid source"})
        # The Rack specification requires the status to be an Integer.
        [200, {'Content-Type' => 'application/json'}, [json]]
      end
    end
  end
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'inci_score/normalizer'
|
2
|
+
require 'inci_score/recognizer'
|
3
|
+
require 'inci_score/scorer'
|
4
|
+
require 'inci_score/response'
|
5
|
+
|
6
|
+
module InciScore
  # Runs the whole pipeline on a raw source: normalize it into ingredients,
  # recognize each ingredient against the catalog, score the recognized ones
  # and wrap everything into a Response.
  class Computer
    # Maximum percentage of unrecognized ingredients for a valid response.
    TOLERANCE = 30.0

    def initialize(src, catalog)
      @src = src
      @catalog = catalog
      @unrecognized = []
    end

    # Returns the memoized Response for the source.
    def call
      @response ||= build_response
    end

    private

    def build_response
      # Components are resolved first: resolving them is what fills
      # @unrecognized, which the validity check reads afterwards.
      names = components.map(&:first)
      Response.new(components: names,
                   unrecognized: @unrecognized,
                   score: score,
                   valid: valid?)
    end

    def score
      hazards = components.map(&:last)
      Scorer.new(hazards).call
    end

    def ingredients
      @ingredients ||= Normalizer.new(src: @src).call
    end

    # Recognized components, one per recognized ingredient; ingredients
    # without a match are collected into @unrecognized.
    def components
      @components ||= begin
        recognized = ingredients.map do |ingredient|
          component = Recognizer.new(ingredient, @catalog).call
          @unrecognized << ingredient unless component
          component
        end
        recognized.compact
      end
    end

    def valid?
      unrecognized_percent = @unrecognized.size / (ingredients.size / 100.0)
      unrecognized_percent <= TOLERANCE
    end
  end
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
require 'inline'
|
2
|
+
|
3
|
+
module InciScore
|
4
|
+
# Wrapper compiled through RubyInline: the C source stored in
# ext/levenshtein.c is read at load time and handed to the inline builder.
class LevenshteinC
  C_PROGRAM = File.expand_path('../../../ext/levenshtein.c', __FILE__)

  inline(:C) { |builder| builder.c(File.read(C_PROGRAM)) }
end
|
11
|
+
|
12
|
+
# Pure Ruby Levenshtein distance working on Unicode code points of the
# downcased strings.
class Levenshtein
  def initialize(s, t)
    @s = s.downcase.unpack("U*")
    @t = t.downcase.unpack("U*")
  end

  # Returns the edit distance between the two strings given at construction.
  def call
    return 0 if @s == @t
    return @t.length if @s.empty?
    return @s.length if @t.empty?

    # previous holds the distances for the preceding source prefix.
    previous = (0..@t.length).to_a
    @s.each_with_index do |source_point, row|
      current = [row + 1]
      @t.each_with_index do |target_point, col|
        cost = source_point == target_point ? 0 : 1
        candidates = [previous[col + 1] + 1, current[col] + 1, previous[col] + cost]
        current << candidates.min
      end
      previous = current
    end
    previous.last
  end
end
|
45
|
+
end
|
46
|
+
|
47
|
+
String::class_eval do
  # Case-insensitive Levenshtein distance computed in pure Ruby on Unicode
  # code points.
  def distance_utf8(t)
    InciScore::Levenshtein.new(self, t).call
  end

  # Case-insensitive Levenshtein distance computed by the C implementation.
  #
  # The C code walks raw bytes, so the lengths handed over are the byte sizes
  # of the downcased strings: passing character counts would truncate any
  # string containing multibyte characters.
  def distance(t)
    source = downcase
    target = t.downcase
    InciScore::LevenshteinC.new.call(source, source.bytesize, target, target.bytesize)
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'inci_score/normalizer_rules'
|
2
|
+
|
3
|
+
module InciScore
  # Pipes a source through a list of rules: each rule is built with the
  # output of the previous one and the output of the last rule is returned.
  class Normalizer
    # Every constant declared under Rules except the abstract Base.
    DEFAULT_RULES = Rules.constants - [:Base]

    attr_reader :src

    # options - Hash with :src (mandatory) and :rules (names of the Rules
    #           constants to apply, DEFAULT_RULES when the key is absent).
    #
    # Raises ArgumentError when :src is missing.
    def initialize(options = {})
      @src = options[:src]
      fail(ArgumentError, 'missing src') unless @src
      @rules = options.key?(:rules) ? options[:rules] : DEFAULT_RULES
    end

    def call
      output = @src
      @rules.each do |name|
        output = Rules.const_get(name).new(output).call
      end
      output
    end
  end
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
module InciScore
|
2
|
+
class Normalizer
|
3
|
+
module Rules
|
4
|
+
class Base
|
5
|
+
SEPARATOR = ','
|
6
|
+
|
7
|
+
def initialize(src)
|
8
|
+
@src = src
|
9
|
+
end
|
10
|
+
|
11
|
+
def call
|
12
|
+
fail NotImplementedError
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
class Replacer < Base
|
17
|
+
REPLACEMENTS = [
|
18
|
+
[/\n+|\t+/, ' '],
|
19
|
+
['‘', "'"],
|
20
|
+
['—', '-'],
|
21
|
+
['(', 'C'],
|
22
|
+
['_', ' '],
|
23
|
+
['~', '-'],
|
24
|
+
['|', 'l'],
|
25
|
+
[' I ', '/']
|
26
|
+
]
|
27
|
+
|
28
|
+
def call
|
29
|
+
REPLACEMENTS.reduce(@src) do |src, replacement|
|
30
|
+
invalid, valid = *replacement
|
31
|
+
src.index(invalid) ? src.gsub(invalid, valid) : src
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
class Downcaser < Base
|
37
|
+
def call
|
38
|
+
@src.downcase
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
class Beheader < Base
|
43
|
+
TITLE_SEP = ':'
|
44
|
+
MAX_INDEX = 50
|
45
|
+
|
46
|
+
def call
|
47
|
+
sep_index = @src.index(TITLE_SEP)
|
48
|
+
return @src if !sep_index || sep_index > MAX_INDEX
|
49
|
+
@src[sep_index+1, @src.size]
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
class Separator < Base
|
54
|
+
SEPARATORS = ["; ", ". ", " ' ", " - ", " : "]
|
55
|
+
|
56
|
+
def call
|
57
|
+
SEPARATORS.reduce(@src) do |src, separator|
|
58
|
+
src = src.gsub(separator, SEPARATOR)
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
63
|
+
class Tokenizer < Base
|
64
|
+
INVALID_CHARS = /[^\w\s-]/
|
65
|
+
|
66
|
+
def call
|
67
|
+
@src.split(SEPARATOR).map do |token|
|
68
|
+
token = token.sub(/\/.*/, '')
|
69
|
+
token = token.gsub(INVALID_CHARS, '')
|
70
|
+
token = token.strip
|
71
|
+
end.reject(&:empty?)
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
76
|
+
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
module InciScore
  # Extracts a { component name => hazard index } Hash from the Biodizionario
  # HTML page (or from any source accepted by Nokogiri::HTML).
  class Parser
    BIODIZIO_URI = 'http://www.biodizionario.it/biodizio.php'
    SEMAPHORES = %w[vv v g r rr]
    CSS_QUERY = 'table[width="751"] > tr > td img'

    # src - optional HTML source; when omitted the remote page is downloaded
    #       in a background thread that is joined on first use.
    def initialize(src = nil)
      @src = src || Thread.new { fetch_remote }
    end

    # Returns the memoized Hash mapping each normalized component name to the
    # index of its semaphore inside SEMAPHORES.
    def call
      @components ||= Nokogiri::HTML(doc).css(CSS_QUERY).inject({}) do |acc, img|
        hazard = semaphore(img.attr('src'))
        # Name and description are each two siblings further along from the
        # image; an all upper-case description means the two are swapped.
        name = img.next_sibling.next_sibling
        desc = name.next_sibling.next_sibling
        name, desc = desc, name if swap?(desc.text)
        acc[normalize(name)] = SEMAPHORES.index(hazard)
        acc
      end
    end

    private

    # Downloads the Biodizionario page. open-uri is required here because
    # Kernel#open does not handle URLs without it; URI.open is preferred
    # where available since Kernel#open stopped accepting URLs in Ruby 3.0.
    def fetch_remote
      require 'open-uri'
      URI.respond_to?(:open) ? URI.open(BIODIZIO_URI) : open(BIODIZIO_URI)
    end

    # Thread#value joins the download thread; any other source is used as is.
    def doc
      @src.respond_to?(:value) ? @src.value : @src
    end

    # Extracts the semaphore code from an image path ending in "<code>.gif".
    def semaphore(src)
      src.match(/(#{SEMAPHORES.join('|')})\.gif$/)[1]
    end

    def normalize(node)
      node.text.strip.downcase
    end

    def swap?(desc)
      return false if desc.empty?
      desc == desc.upcase
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'inci_score/recognizer_rules'
|
2
|
+
|
3
|
+
module InciScore
  # Looks an ingredient up in the catalog by trying a list of rules; the
  # value of the first rule that answers something truthy wins.
  class Recognizer
    # Every constant declared under Rules except the abstract Base.
    DEFAULT_RULES = Rules.constants - [:Base]

    def initialize(src, catalog, rules = DEFAULT_RULES)
      @src = src
      @catalog = catalog
      @rules = rules
    end

    # Returns [component, catalog value for that component] when a rule
    # recognizes the source, nil otherwise.
    def call
      @component = apply_rules
      return unless @component
      [@component, @catalog[@component]]
    end

    private

    def apply_rules
      found = nil
      @rules.each do |name|
        # Every rule is instantiated, but rules are only invoked until one
        # of them produces a component.
        rule = Rules.const_get(name).new(@src, @catalog)
        found ||= rule.call
      end
      found
    end
  end
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
require 'inci_score/levenshtein'
|
2
|
+
|
3
|
+
module InciScore
|
4
|
+
class Recognizer
|
5
|
+
module Rules
|
6
|
+
class Base
|
7
|
+
TOLERANCE = 3
|
8
|
+
|
9
|
+
def initialize(src, catalog)
|
10
|
+
@src = src
|
11
|
+
@catalog = catalog
|
12
|
+
end
|
13
|
+
|
14
|
+
def call
|
15
|
+
fail NotmplementedError
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
class Key < Base
|
20
|
+
def call
|
21
|
+
@src if @catalog.has_key?(@src)
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
class Levenshtein < Base
|
26
|
+
ALTERNATE_SEP = '/'
|
27
|
+
|
28
|
+
def call
|
29
|
+
size = @src.size
|
30
|
+
initial = @src[0]
|
31
|
+
component, distance = @catalog.reduce([nil, size]) do |min, (comp, _)|
|
32
|
+
next min unless comp.start_with?(initial)
|
33
|
+
match = (n = comp.index(ALTERNATE_SEP)) ? comp[0, n] : comp
|
34
|
+
next min if match.size > (size + TOLERANCE)
|
35
|
+
dist = @src.distance(match)
|
36
|
+
min = [comp, dist] if dist < min[1]
|
37
|
+
min
|
38
|
+
end
|
39
|
+
component unless distance > TOLERANCE || distance >= (size-1)
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
class Digits < Base
|
44
|
+
MIN_MEANINGFUL = 7
|
45
|
+
|
46
|
+
def call
|
47
|
+
return if @src.size < TOLERANCE
|
48
|
+
digits = @src[0, MIN_MEANINGFUL]
|
49
|
+
@catalog.detect do |component, _|
|
50
|
+
component.match(/^#{Regexp::escape(digits)}/)
|
51
|
+
end.to_a.first
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
class Tokens < Base
|
56
|
+
UNMATCHABLE = %w[extract oil sodium acid sulfate]
|
57
|
+
|
58
|
+
def call
|
59
|
+
tokens.each do |token|
|
60
|
+
@catalog.each do |component, _|
|
61
|
+
return component if component.match(/\b#{Regexp.escape(token)}\b/)
|
62
|
+
end
|
63
|
+
end
|
64
|
+
nil
|
65
|
+
end
|
66
|
+
|
67
|
+
private
|
68
|
+
|
69
|
+
def tokens
|
70
|
+
(@src.split(' ') - UNMATCHABLE).reject { |t| t.size < TOLERANCE }.sort_by!(&:size).reverse!
|
71
|
+
end
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'json'
|
2
|
+
|
3
|
+
module InciScore
  # Value holder for the outcome of a computation: recognized components,
  # unrecognized ingredients, total score and validity flag.
  class Response
    attr_reader :components, :score, :unrecognized, :valid

    # options - Hash with the optional keys :components (default []),
    #           :score (default 0.0), :unrecognized (default []) and
    #           :valid (default false).
    def initialize(options = {})
      @components = options.fetch(:components) { [] }
      @score = options.fetch(:score) { 0.0 }
      @unrecognized = options.fetch(:unrecognized) { [] }
      @valid = options.fetch(:valid) { false }
    end

    # Serializes the response as a JSON object.
    #
    # The arguments are accepted and forwarded because the JSON generator
    # calls to_json with a state argument on nested objects: without them a
    # response placed inside an Array or Hash could not be serialized.
    def to_json(*args)
      { components: @components, unrecognized: @unrecognized, score: @score, valid: @valid }.to_json(*args)
    end

    # Multi-line, human readable report of the response.
    def to_s
      %Q{
TOTAL SCORE:
\t#{@score}
VALID STATE:
\t#{@valid}
COMPONENTS:
#{@components.map { |c| "\t#{c}" }.join("\n")}
UNRECOGNIZED:
#{@unrecognized.map { |c| "\t#{c}" }.join("\n")}
}
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module InciScore
  # Hazard lowered by a weight, never going below zero.
  class Score
    attr_reader :value

    def initialize(hazard, weight)
      @hazard = hazard
      @weight = weight
      @value = compute
    end

    private

    # Returns the hazard minus the weight, or 0.0 when that is negative.
    def compute
      net = @hazard - @weight
      net < 0 ? 0.0 : net
    end
  end
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'inci_score/score'
|
2
|
+
|
3
|
+
module InciScore
  # Turns a list of hazards into a single score: 100 minus the average
  # hazard multiplied by HAZARD_PERCENT.
  class Scorer
    HAZARD_PERCENT = 25
    WEIGHT_FACTOR = 5

    # hazards - a single hazard or a list of hazards (coerced with Array()).
    def initialize(hazards)
      @hazards = Array(hazards)
      @size = @hazards.size
    end

    # Returns 0 when there are no hazards, the computed score otherwise.
    def call
      @hazards.empty? ? 0 : 100 - avg * HAZARD_PERCENT
    end

    private

    def avg
      avg_weighted / @size.to_f
    end

    # Plain sum when all the hazards are equal, otherwise the sum of the
    # hazards each lowered by its position-dependent weight.
    def avg_weighted
      if same_hazard?
        @hazards.inject(:+)
      else
        weighted.map(&:value).inject(0.0, :+)
      end
    end

    def same_hazard?
      @hazards.uniq.length == 1
    end

    def weighted
      @hazards.map.with_index do |hazard, position|
        Score.new(hazard, weight(position))
      end
    end

    # Logarithm of the 1-based position in base (size * WEIGHT_FACTOR): zero
    # for the first hazard and growing with the position.
    def weight(index)
      base = @size * WEIGHT_FACTOR
      Math.log(index + 1, base)
    end
  end
end
|
data/lib/inci_score.rb
ADDED
data/log/.gitignore
ADDED