inci_score 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +11 -0
- data/.travis.yml +7 -0
- data/Gemfile +4 -0
- data/README.md +117 -0
- data/Rakefile +18 -0
- data/bin/console +7 -0
- data/bin/inci_score +7 -0
- data/bin/setup +6 -0
- data/config/catalog.yml +5014 -0
- data/config.ru +3 -0
- data/ext/levenshtein.c +43 -0
- data/inci_score.gemspec +28 -0
- data/lib/inci_score/api/app.rb +21 -0
- data/lib/inci_score/catalog.rb +13 -0
- data/lib/inci_score/computer.rb +45 -0
- data/lib/inci_score/levenshtein.rb +55 -0
- data/lib/inci_score/normalizer.rb +21 -0
- data/lib/inci_score/normalizer_rules.rb +76 -0
- data/lib/inci_score/parser.rb +43 -0
- data/lib/inci_score/recognizer.rb +27 -0
- data/lib/inci_score/recognizer_rules.rb +75 -0
- data/lib/inci_score/response.rb +31 -0
- data/lib/inci_score/score.rb +19 -0
- data/lib/inci_score/scorer.rb +45 -0
- data/lib/inci_score/version.rb +3 -0
- data/lib/inci_score.rb +4 -0
- data/log/.gitignore +4 -0
- metadata +170 -0
data/config.ru
ADDED
data/ext/levenshtein.c
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
/*
 * Levenshtein edit distance between two byte buffers.
 *
 * word1, len1: first buffer and the number of bytes to read from it.
 * word2, len2: second buffer and the number of bytes to read from it.
 * Returns the minimum number of single-byte insertions, deletions and
 * substitutions turning word1 into word2.
 *
 * Only two rows of the dynamic-programming table are kept, so stack usage
 * grows with len2 alone instead of len1 * len2. Negative lengths are
 * treated as zero so the row arrays never get a non-positive size.
 */
static int call(const char* word1, int len1, const char* word2, int len2) {
  const int n1 = len1 > 0 ? len1 : 0;
  const int n2 = len2 > 0 ? len2 : 0;
  int row_a[n2 + 1];
  int row_b[n2 + 1];
  int *previous = row_a;
  int *current = row_b;
  int i;
  int j;

  /* Row 0: turning the empty prefix of word1 into a prefix of word2. */
  for (j = 0; j <= n2; j++) {
    previous[j] = j;
  }

  for (i = 1; i <= n1; i++) {
    const char c1 = word1[i - 1];
    int *swap;

    /* Column 0: deleting the first i bytes of word1. */
    current[0] = i;

    for (j = 1; j <= n2; j++) {
      if (c1 == word2[j - 1]) {
        /* Same byte: carry the diagonal cost over unchanged. */
        current[j] = previous[j - 1];
      }
      else {
        int minimum = previous[j] + 1;            /* deletion */
        const int insert = current[j - 1] + 1;
        const int substitute = previous[j - 1] + 1;

        if (insert < minimum) {
          minimum = insert;
        }
        if (substitute < minimum) {
          minimum = substitute;
        }
        current[j] = minimum;
      }
    }

    /* The row just filled becomes the reference row for the next byte. */
    swap = previous;
    previous = current;
    current = swap;
  }

  return previous[n2];
}
|
data/inci_score.gemspec
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
lib = File.expand_path('../lib', __FILE__)
|
2
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
3
|
+
require 'inci_score/version'
|
4
|
+
|
5
|
+
# Gem specification for inci_score: metadata, packaged files, the command line
# executable and the runtime/development dependencies.
Gem::Specification.new do |s|
  s.name = "inci_score"
  s.version = InciScore::VERSION
  s.authors = ["costajob"]
  s.email = ["costajob@gmail.com"]
  s.summary = %q{A library that computes the hazard of cosmetic products components, based on the Biodizionario data.}
  s.homepage = "https://github.com/costajob/inci_score.git"
  s.license = "MIT"
  s.required_ruby_version = ">= 2.0.0"

  # Package every file tracked by git except test/spec/feature directories.
  s.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(spec|test|s|features)/}) }
  # The command line entry point ships as bin/inci_score; the previous
  # "exe" bindir matched no packaged file, so no executable was installed.
  # bin/console and bin/setup are left out on purpose by listing the
  # executable explicitly.
  s.bindir = "bin"
  s.executables = %w[inci_score]
  s.require_paths = ["lib"]

  s.add_runtime_dependency "nokogiri", "~> 1.6"
  s.add_runtime_dependency "puma", "~> 3.0"
  s.add_runtime_dependency "RubyInline", "~> 3.0"

  s.add_development_dependency "bundler", "~> 1.11"
  s.add_development_dependency "rake", "~> 10.0"
  s.add_development_dependency "minitest", "~> 5.0"
  s.add_development_dependency "rack-test", "~> 0.6"
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'rack'
|
2
|
+
require 'inci_score'
|
3
|
+
|
4
|
+
module InciScore
  module API
    # Rack application computing the INCI score of the "src" request parameter.
    module App
      extend self

      # Catalog used for recognition, fetched once and memoized on the module.
      def catalog
        @catalog ||= Catalog.fetch
      end

      # Rack entry point.
      #
      # @param env [Hash] the Rack environment
      # @return [Array] the Rack triplet: Integer status, headers, body.
      #   The body is the JSON of the computed response, or a JSON error
      #   message when no "src" parameter was supplied.
      def call(env)
        req = Rack::Request.new(env)
        src = req.params["src"]
        json = src ? Computer.new(src, catalog).call.to_json : %q({"error": "no valid source"})
        # The Rack SPEC expects the status to be an Integer, not a String.
        [200, {'Content-Type' => 'application/json'}, [json]]
      end
    end
  end
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'inci_score/normalizer'
|
2
|
+
require 'inci_score/recognizer'
|
3
|
+
require 'inci_score/scorer'
|
4
|
+
require 'inci_score/response'
|
5
|
+
|
6
|
+
module InciScore
  # Runs the whole pipeline on a raw ingredients source: normalization,
  # recognition against the catalog, scoring, and validity evaluation.
  class Computer
    # Maximum percentage of unrecognized ingredients for a valid result.
    TOLERANCE = 30.0

    # @param src the raw ingredients source handed to the Normalizer
    # @param catalog the catalog handed to each Recognizer
    def initialize(src, catalog)
      @src = src
      @catalog = catalog
      @unrecognized = []
    end

    # @return [Response] the memoized response carrying recognized component
    #   names, unrecognized ingredients, score and validity flag
    def call
      @response ||= Response.new(components: components.map(&:first),
                                 unrecognized: @unrecognized,
                                 score: score,
                                 valid: valid?)
    end

    private

    # Score computed from the second element of every recognized pair.
    def score
      Scorer.new(components.map(&:last)).call
    end

    # Normalized ingredients, memoized.
    def ingredients
      @ingredients ||= Normalizer.new(src: @src).call
    end

    # Recognized pairs, memoized. Ingredients the Recognizer cannot resolve
    # are collected into @unrecognized as a side effect.
    def components
      @components ||= ingredients.map do |ingredient|
        Recognizer.new(ingredient, @catalog).call.tap do |component|
          @unrecognized << ingredient unless component
        end
      end.compact
    end

    # True when the share of unrecognized ingredients is within TOLERANCE.
    def valid?
      # Recognition fills @unrecognized: force it first so the answer does
      # not depend on which private method happened to run before this one.
      components
      return false if ingredients.empty?
      # Multiply before dividing: the numerator is exact, so a share of
      # exactly TOLERANCE percent is never pushed over by float rounding.
      @unrecognized.size * 100.0 / ingredients.size <= TOLERANCE
    end
  end
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
require 'inline'
|
2
|
+
|
3
|
+
module InciScore
|
4
|
+
# Wrapper whose instances expose the C routine from ext/levenshtein.c,
# built through the RubyInline `inline` builder.
class LevenshteinC
  # Absolute path of the C source handed to the inline builder.
  C_PROGRAM = File.expand_path('../../../ext/levenshtein.c', __FILE__)

  inline(:C) { |builder| builder.c(File.read(C_PROGRAM)) }
end
|
11
|
+
|
12
|
+
# Pure Ruby Levenshtein distance working on Unicode code points, so that
# multi-byte characters count as a single edit. Comparison is case-insensitive.
class Levenshtein
  # @param s [String] first string
  # @param t [String] second string
  def initialize(s, t)
    @s = s.downcase.unpack("U*")
    @t = t.downcase.unpack("U*")
  end

  # @return [Integer] the number of code point insertions, deletions and
  #   substitutions needed to turn the first string into the second
  def call
    return 0 if @s == @t
    return @t.length if @s.empty?
    return @s.length if @t.empty?

    # Single rolling row of the dynamic-programming table.
    row = (0..@t.length).to_a

    @s.each_with_index do |source_point, i|
      # Cost of the cell to the left of the one being computed.
      left = i + 1
      @t.each_with_index do |target_point, j|
        cost = source_point == target_point ? 0 : 1
        best = [row[j + 1] + 1, left + 1, row[j] + cost].min
        row[j] = left
        left = best
      end
      row[-1] = left
    end

    row.last
  end
end
|
45
|
+
end
|
46
|
+
|
47
|
+
# Adds Levenshtein distance helpers to String.
String.class_eval do
  # Case-insensitive distance computed in Ruby on Unicode code points.
  #
  # @param t [String] the string to compare with
  # @return [Integer] the edit distance
  def distance_utf8(t)
    InciScore::Levenshtein.new(self, t).call
  end

  # Case-insensitive distance computed by the C routine.
  #
  # The C routine walks raw bytes, so it is given byte lengths measured on
  # the downcased strings it actually receives. Passing character counts
  # made it stop short on strings holding multi-byte characters.
  #
  # @param t [String] the string to compare with
  # @return [Integer] the edit distance, counted in bytes
  def distance(t)
    source = downcase
    target = t.downcase
    InciScore::LevenshteinC.new.call(source, source.bytesize, target, target.bytesize)
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'inci_score/normalizer_rules'
|
2
|
+
|
3
|
+
module InciScore
  # Feeds a raw source through a chain of rule classes, each one receiving
  # the output of the previous one.
  class Normalizer
    # Every rule constant except the abstract Base, in the order reported
    # by Rules.constants.
    DEFAULT_RULES = Rules.constants - [:Base]

    attr_reader :src

    # @param options [Hash] :src is the mandatory source to normalize;
    #   :rules optionally overrides the rule names to apply
    # @raise [ArgumentError] when :src is missing
    def initialize(options = {})
      src = options[:src]
      fail(ArgumentError, 'missing src') unless src
      @src = src
      @rules = options.fetch(:rules) { DEFAULT_RULES }
    end

    # @return the source after every rule has been applied in sequence
    def call
      result = @src
      @rules.each do |name|
        result = Rules.const_get(name).new(result).call
      end
      result
    end
  end
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
module InciScore
|
2
|
+
class Normalizer
|
3
|
+
module Rules
|
4
|
+
class Base
|
5
|
+
SEPARATOR = ','
|
6
|
+
|
7
|
+
def initialize(src)
|
8
|
+
@src = src
|
9
|
+
end
|
10
|
+
|
11
|
+
def call
|
12
|
+
fail NotImplementedError
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
class Replacer < Base
|
17
|
+
REPLACEMENTS = [
|
18
|
+
[/\n+|\t+/, ' '],
|
19
|
+
['‘', "'"],
|
20
|
+
['—', '-'],
|
21
|
+
['(', 'C'],
|
22
|
+
['_', ' '],
|
23
|
+
['~', '-'],
|
24
|
+
['|', 'l'],
|
25
|
+
[' I ', '/']
|
26
|
+
]
|
27
|
+
|
28
|
+
def call
|
29
|
+
REPLACEMENTS.reduce(@src) do |src, replacement|
|
30
|
+
invalid, valid = *replacement
|
31
|
+
src.index(invalid) ? src.gsub(invalid, valid) : src
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
class Downcaser < Base
|
37
|
+
def call
|
38
|
+
@src.downcase
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
class Beheader < Base
|
43
|
+
TITLE_SEP = ':'
|
44
|
+
MAX_INDEX = 50
|
45
|
+
|
46
|
+
def call
|
47
|
+
sep_index = @src.index(TITLE_SEP)
|
48
|
+
return @src if !sep_index || sep_index > MAX_INDEX
|
49
|
+
@src[sep_index+1, @src.size]
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
class Separator < Base
|
54
|
+
SEPARATORS = ["; ", ". ", " ' ", " - ", " : "]
|
55
|
+
|
56
|
+
def call
|
57
|
+
SEPARATORS.reduce(@src) do |src, separator|
|
58
|
+
src = src.gsub(separator, SEPARATOR)
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
63
|
+
class Tokenizer < Base
|
64
|
+
INVALID_CHARS = /[^\w\s-]/
|
65
|
+
|
66
|
+
def call
|
67
|
+
@src.split(SEPARATOR).map do |token|
|
68
|
+
token = token.sub(/\/.*/, '')
|
69
|
+
token = token.gsub(INVALID_CHARS, '')
|
70
|
+
token = token.strip
|
71
|
+
end.reject(&:empty?)
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
76
|
+
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
module InciScore
  # Extracts a { component name => hazard index } map from the Biodizionario
  # HTML page, either passed in or downloaded in a background thread.
  class Parser
    BIODIZIO_URI = 'http://www.biodizionario.it/biodizio.php'
    # Image base names; the position in this list is the stored hazard index.
    SEMAPHORES = %w[vv v g r rr]
    CSS_QUERY = 'table[width="751"] > tr > td img'
    # Captures the semaphore base name of a ".gif" image path.
    SEMAPHORE_PATTERN = /(#{SEMAPHORES.join('|')})\.gif$/

    # @param src the HTML document to parse; when nil the page at
    #   BIODIZIO_URI is fetched in a separate thread
    def initialize(src = nil)
      @src = src || Thread.new { fetch_remote }
    end

    # @return [Hash] memoized map of downcased component name to its index
    #   in SEMAPHORES
    def call
      @components ||= Nokogiri::HTML(doc).css(CSS_QUERY).inject({}) do |acc, img|
        hazard = semaphore(img.attr('src'))
        name = img.next_sibling.next_sibling
        desc = name.next_sibling.next_sibling
        # When the second node is the all-uppercase one, use it as the name.
        name, desc = desc, name if swap?(desc.text)
        acc[normalize(name)] = SEMAPHORES.index(hazard)
        acc
      end
    end

    private

    # Opens the remote page. open-uri is loaded lazily because it is only
    # needed when no source was supplied. URI.open is preferred where it
    # exists, since Kernel#open no longer handles URLs on Ruby 3.
    def fetch_remote
      require 'open-uri'
      URI.respond_to?(:open) ? URI.open(BIODIZIO_URI) : open(BIODIZIO_URI)
    end

    # The document itself, or the value of the download thread.
    def doc
      @src.respond_to?(:value) ? @src.value : @src
    end

    # @param src [String] the image path
    # @return [String] the semaphore base name found in the path
    # @raise [ArgumentError] when the path is not a known semaphore image
    def semaphore(src)
      found = src.match(SEMAPHORE_PATTERN)
      fail(ArgumentError, "unknown semaphore image: #{src}") unless found
      found[1]
    end

    def normalize(node)
      node.text.strip.downcase
    end

    # True for a non-empty text that has no lowercase letters.
    def swap?(desc)
      return false if desc.empty?
      desc == desc.upcase
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'inci_score/recognizer_rules'
|
2
|
+
|
3
|
+
module InciScore
  # Resolves one ingredient against the catalog by trying a list of rules
  # in order and keeping the first answer.
  class Recognizer
    # Every rule constant except the abstract Base, in the order reported
    # by Rules.constants.
    DEFAULT_RULES = Rules.constants - [:Base]

    # @param src the ingredient to recognize
    # @param catalog the catalog looked up with the recognized component
    # @param rules [Array<Symbol>] names of the rule classes to apply
    def initialize(src, catalog, rules = DEFAULT_RULES)
      @src = src
      @catalog = catalog
      @rules = rules
    end

    # @return [Array, nil] the pair [component, catalog[component]], or nil
    #   when no rule recognized the ingredient
    def call
      @component = apply_rules
      [@component, @catalog[@component]] if @component
    end

    private

    # Walks the rules in order; once one has answered, the following ones
    # are still instantiated but never called.
    def apply_rules
      found = nil
      @rules.each do |name|
        rule = Rules.const_get(name).new(@src, @catalog)
        found ||= rule.call
      end
      found
    end
  end
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
require 'inci_score/levenshtein'
|
2
|
+
|
3
|
+
module InciScore
  class Recognizer
    # Recognition strategies. Recognizer::DEFAULT_RULES is derived from
    # Rules.constants, so the order of the definitions below is kept as is.
    module Rules
      # Common constructor for every rule; subclasses implement #call and
      # return the matching catalog component, or nil.
      class Base
        # Shared threshold: maximum accepted distance and minimum length.
        TOLERANCE = 3

        # @param src [String] the ingredient to recognize
        # @param catalog the catalog whose entries are component names
        def initialize(src, catalog)
          @src = src
          @catalog = catalog
        end

        # @raise [NotImplementedError] always; subclasses must override
        def call
          fail NotImplementedError
        end
      end

      # Exact match on a catalog key.
      class Key < Base
        def call
          @src if @catalog.has_key?(@src)
        end
      end

      # Closest catalog component by edit distance, among those sharing the
      # first character of the source.
      class Levenshtein < Base
        # Components may list alternatives after this separator; only the
        # part before it is compared.
        ALTERNATE_SEP = '/'

        def call
          # An empty source has no first character to filter the catalog on.
          return if @src.empty?
          size = @src.size
          initial = @src[0]
          component, distance = @catalog.reduce([nil, size]) do |min, (comp, _)|
            next min unless comp.start_with?(initial)
            match = (n = comp.index(ALTERNATE_SEP)) ? comp[0, n] : comp
            # Skip candidates too long to fall within the tolerance.
            next min if match.size > (size + TOLERANCE)
            dist = @src.distance(match)
            dist < min[1] ? [comp, dist] : min
          end
          component unless distance > TOLERANCE || distance >= (size - 1)
        end
      end

      # First catalog component starting with the leading MIN_MEANINGFUL
      # characters of the source.
      class Digits < Base
        MIN_MEANINGFUL = 7

        def call
          return if @src.size < TOLERANCE
          digits = @src[0, MIN_MEANINGFUL]
          found = @catalog.detect { |component, _| component.start_with?(digits) }
          found.to_a.first
        end
      end

      # First catalog component containing one of the source words as a
      # whole word, longest words tried first.
      class Tokens < Base
        # Words too generic to identify a component.
        UNMATCHABLE = %w[extract oil sodium acid sulfate]

        def call
          tokens.each do |token|
            # Built once per token instead of once per catalog entry.
            pattern = /\b#{Regexp.escape(token)}\b/
            @catalog.each do |component, _|
              return component if component =~ pattern
            end
          end
          nil
        end

        private

        # Source words minus the unmatchable and too-short ones, longest first.
        def tokens
          (@src.split(' ') - UNMATCHABLE).reject { |t| t.size < TOLERANCE }.sort_by(&:size).reverse
        end
      end
    end
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'json'
|
2
|
+
|
3
|
+
module InciScore
  # Value object carrying the outcome of a computation.
  class Response
    attr_reader :components, :score, :unrecognized, :valid

    # @param options [Hash] :components (default []), :score (default 0.0),
    #   :unrecognized (default []) and :valid (default false)
    def initialize(options = {})
      @components = options.fetch(:components) { [] }
      @score = options.fetch(:score) { 0.0 }
      @unrecognized = options.fetch(:unrecognized) { [] }
      @valid = options.fetch(:valid) { false }
    end

    # Accepts and forwards the optional generator arguments, so the response
    # can also be serialized when nested in another structure or handed to
    # JSON.generate, which call to_json with a state argument.
    #
    # @return [String] the JSON object with the four attributes
    def to_json(*args)
      { components: @components, unrecognized: @unrecognized, score: @score, valid: @valid }.to_json(*args)
    end

    # @return [String] a multi-line, tab-indented textual report
    def to_s
      %Q{
TOTAL SCORE:
\t#{@score}
VALID STATE:
\t#{@valid}
COMPONENTS:
#{@components.map { |c| "\t#{c}" }.join("\n")}
UNRECOGNIZED:
#{@unrecognized.map { |c| "\t#{c}" }.join("\n")}
}
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module InciScore
  # Hazard lowered by a weight, never going below zero.
  class Score
    attr_reader :value

    # @param hazard [Numeric] the hazard of a component
    # @param weight [Numeric] the amount subtracted from the hazard
    def initialize(hazard, weight)
      @hazard = hazard
      @weight = weight
      @value = compute
    end

    private

    # @return [Numeric] hazard minus weight, or 0.0 when that is negative
    def compute
      difference = @hazard - @weight
      difference < 0 ? 0.0 : difference
    end
  end
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'inci_score/score'
|
2
|
+
|
3
|
+
module InciScore
  # Turns a list of hazards into a single score out of 100.
  class Scorer
    # Points taken off the score per unit of average hazard.
    HAZARD_PERCENT = 25
    # Multiplier of the list size giving the base of the weight logarithm.
    WEIGHT_FACTOR = 5

    # @param hazards [Array<Numeric>, nil] the hazards, in source order
    def initialize(hazards)
      @hazards = Array(hazards)
      @size = @hazards.size
    end

    # @return [Numeric] 0 for an empty list, otherwise 100 minus the average
    #   hazard scaled by HAZARD_PERCENT
    def call
      @hazards.empty? ? 0 : 100 - avg * HAZARD_PERCENT
    end

    private

    def avg
      avg_weighted / @size.to_f
    end

    # Plain sum when every hazard is the same, otherwise the sum of the
    # position-weighted scores.
    def avg_weighted
      if same_hazard?
        @hazards.reduce(&:+)
      else
        weighted.inject(0.0) { |total, score| total + score.value }
      end
    end

    def same_hazard?
      @hazards.uniq.length == 1
    end

    # One Score per hazard, weighted by its position in the list.
    def weighted
      @hazards.map.with_index do |hazard, position|
        Score.new(hazard, weight(position))
      end
    end

    # Logarithm of the 1-based position in base (size * WEIGHT_FACTOR):
    # zero for the first entry, growing for the following ones.
    def weight(index)
      Math.log(index + 1, @size * WEIGHT_FACTOR)
    end
  end
end
|
data/lib/inci_score.rb
ADDED
data/log/.gitignore
ADDED