inci_score 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/config.ru ADDED
@@ -0,0 +1,3 @@
1
+ require 'inci_score/api/app'
2
+
3
+ run InciScore::API::App
data/ext/levenshtein.c ADDED
@@ -0,0 +1,43 @@
1
/*
 * Levenshtein (edit) distance between the first len1 bytes of word1 and the
 * first len2 bytes of word2. Comparison is byte-wise.
 *
 * Only two rows of the dynamic-programming table are kept, so the stack
 * usage is proportional to len2 instead of len1 * len2: the full matrix
 * overflowed the stack on long inputs.
 *
 * Returns the minimum number of single-byte insertions, deletions and
 * substitutions needed to turn one input into the other.
 */
static int call(const char* word1, int len1, const char* word2, int len2) {
  int previous[len2 + 1];
  int current[len2 + 1];
  int i;
  int j;

  /* Distance from the empty prefix of word1 to every prefix of word2. */
  for (j = 0; j <= len2; j++) {
    previous[j] = j;
  }

  for (i = 1; i <= len1; i++) {
    char c1 = word1[i-1];

    /* Distance from a prefix of word1 to the empty prefix of word2. */
    current[0] = i;

    for (j = 1; j <= len2; j++) {
      if (c1 == word2[j-1]) {
        current[j] = previous[j-1];
      }
      else {
        int removal = previous[j] + 1;
        int insertion = current[j-1] + 1;
        int substitution = previous[j-1] + 1;
        int minimum = removal;

        if (insertion < minimum) {
          minimum = insertion;
        }
        if (substitution < minimum) {
          minimum = substitution;
        }
        current[j] = minimum;
      }
    }

    /* The row just computed becomes the reference row of the next pass. */
    for (j = 0; j <= len2; j++) {
      previous[j] = current[j];
    }
  }
  return previous[len2];
}
@@ -0,0 +1,28 @@
1
+ lib = File.expand_path('../lib', __FILE__)
2
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
+ require 'inci_score/version'
4
+
5
# Gem packaging metadata for inci_score.
Gem::Specification.new do |spec|
  spec.name = "inci_score"
  spec.version = InciScore::VERSION
  spec.authors = ["costajob"]
  spec.email = ["costajob@gmail.com"]
  spec.summary = %q{A library that computes the hazard of cosmetic products components, based on the Biodizionario data.}
  spec.homepage = "https://github.com/costajob/inci_score.git"
  spec.license = "MIT"
  spec.required_ruby_version = ">= 2.0.0"

  # Package every file tracked by git, except those under the excluded
  # top-level directories.
  tracked = `git ls-files -z`.split("\x0")
  spec.files = tracked.reject { |path| path.match(%r{^(spec|test|s|features)/}) }
  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{^exe/}) { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  # Runtime dependencies.
  spec.add_runtime_dependency "nokogiri", "~> 1.6"
  spec.add_runtime_dependency "puma", "~> 3.0"
  spec.add_runtime_dependency "RubyInline", "~> 3.0"

  # Development-only dependencies.
  spec.add_development_dependency "bundler", "~> 1.11"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "minitest", "~> 5.0"
  spec.add_development_dependency "rack-test", "~> 0.6"
end
@@ -0,0 +1,21 @@
1
+ require 'rack'
2
+ require 'inci_score'
3
+
4
module InciScore
  module API
    # Rack application exposing the score computation as a JSON endpoint.
    module App
      extend self

      # Returns the catalog, fetched once and memoized on the module.
      def catalog
        @catalog ||= Catalog.fetch
      end

      # Rack entry point.
      #
      # Reads the "src" request parameter and, when present, answers with the
      # JSON representation of the computed response; otherwise answers with
      # a JSON error payload.
      #
      # @param env [Hash] the Rack environment
      # @return [Array] the Rack triplet: status, headers, body
      def call(env)
        src = Rack::Request.new(env).params["src"]
        json = src ? Computer.new(src, catalog).call.to_json : %q({"error": "no valid source"})
        # The status is an Integer, as the Rack specification expects
        # (it used to be the String '200').
        [200, {'Content-Type' => 'application/json'}, [json]]
      end
    end
  end
end
@@ -0,0 +1,13 @@
1
+ require 'yaml'
2
+
3
module InciScore
  # Loads the catalog from its YAML representation.
  module Catalog
    extend self

    # Location of the bundled catalog, resolved relative to this file.
    YAML_PATH = File.expand_path('../../../config/catalog.yml', __FILE__)

    # Parses a YAML document into the catalog.
    #
    # @param src [String] the YAML text; defaults to the content of YAML_PATH
    # @return [Object] whatever the YAML document deserializes to
    def fetch(src = File.read(YAML_PATH))
      # NOTE(review): YAML.load can instantiate arbitrary objects on older
      # Ruby versions; only feed it trusted documents.
      YAML.load(src)
    end
  end
end
@@ -0,0 +1,45 @@
1
+ require 'inci_score/normalizer'
2
+ require 'inci_score/recognizer'
3
+ require 'inci_score/scorer'
4
+ require 'inci_score/response'
5
+
6
module InciScore
  # Drives the whole computation: normalizes the source into ingredients,
  # recognizes each of them against the catalog, scores the recognized
  # components and wraps everything into a Response.
  class Computer
    # Maximum percentage of unrecognized ingredients for a valid response.
    TOLERANCE = 30.0

    # @param src [String] the raw source to analyze
    # @param catalog [Hash] the catalog handed to each Recognizer
    def initialize(src, catalog)
      @src = src
      @catalog = catalog
      @unrecognized = []
    end

    # @return [Response] the memoized outcome of the computation
    def call
      @response ||= build_response
    end

    private

    # Components are resolved first: resolving them is what fills
    # @unrecognized, which both the response and valid? rely on.
    def build_response
      names = components.map(&:first)
      Response.new(components: names,
                   unrecognized: @unrecognized,
                   score: score,
                   valid: valid?)
    end

    def score
      hazards = components.map(&:last)
      Scorer.new(hazards).call
    end

    def ingredients
      @ingredients ||= Normalizer.new(src: @src).call
    end

    # Recognized components, in ingredient order; ingredients that cannot be
    # recognized are collected into @unrecognized instead.
    def components
      @components ||= ingredients.each_with_object([]) do |ingredient, recognized|
        component = Recognizer.new(ingredient, @catalog).call
        if component
          recognized << component
        else
          @unrecognized << ingredient
        end
      end
    end

    # True when the percentage of unrecognized ingredients stays within
    # TOLERANCE.
    def valid?
      @unrecognized.size / (ingredients.size / 100.0) <= TOLERANCE
    end
  end
end
@@ -0,0 +1,55 @@
1
+ require 'inline'
2
+
3
+ module InciScore
4
# Wrapper around the C implementation of the Levenshtein distance, which is
# handed to RubyInline's C builder when this class is loaded.
class LevenshteinC
  # Location of the C source, resolved relative to this file.
  C_PROGRAM = File.expand_path('../../../ext/levenshtein.c', __FILE__)

  inline(:C) { |builder| builder.c(File.read(C_PROGRAM)) }
end
11
+
12
# Pure-Ruby Levenshtein distance, computed on the Unicode code points of the
# downcased inputs.
class Levenshtein
  # @param s [String] the first string
  # @param t [String] the second string
  def initialize(s, t)
    @s = s.downcase.unpack("U*")
    @t = t.downcase.unpack("U*")
  end

  # @return [Integer] the edit distance between the two strings
  def call
    return 0 if @s == @t
    return @t.length if @s.empty?
    return @s.length if @t.empty?

    width = @t.length
    # Single rolling row of the dynamic-programming table.
    row = (0..width).to_a
    cell = nil

    @s.each_with_index do |source_point, i|
      left = i + 1
      @t.each_with_index do |target_point, j|
        substitution = row[j] + (source_point == target_point ? 0 : 1)
        cell = [row[j + 1] + 1, left + 1, substitution].min
        row[j] = left
        left = cell
      end
      row[width] = cell
    end
    cell
  end
end
45
+ end
46
+
47
# Adds the Levenshtein distance helpers to String.
String.class_eval do
  # Distance computed by the pure-Ruby implementation.
  #
  # @param t [String] the string to compare with
  # @return [Integer] the edit distance
  def distance_utf8(t)
    InciScore::Levenshtein.new(self, t).call
  end

  # Distance computed by the C implementation.
  #
  # The C routine receives explicit lengths for its char buffers, so it is
  # given the byte sizes of the strings actually passed (the downcased
  # ones). Passing the character count of the original strings truncated
  # the comparison as soon as a multibyte character was involved.
  #
  # @param t [String] the string to compare with
  # @return [Integer] the edit distance
  def distance(t)
    source = downcase
    target = t.downcase
    InciScore::LevenshteinC.new.call(source, source.bytesize, target, target.bytesize)
  end
end
@@ -0,0 +1,21 @@
1
+ require 'inci_score/normalizer_rules'
2
+
3
module InciScore
  # Runs the source through a pipeline of normalization rules.
  class Normalizer
    # Every rule defined under Rules except the abstract Base, in definition
    # order.
    DEFAULT_RULES = Rules.constants - [:Base]

    attr_reader :src

    # @param options [Hash] :src (mandatory) is the text to normalize,
    #   :rules (optional) the names of the rules to apply
    # @raise [ArgumentError] when :src is missing
    def initialize(options = {})
      @src = options[:src]
      fail(ArgumentError, 'missing src') unless @src
      @rules = options.fetch(:rules) { DEFAULT_RULES }
    end

    # Feeds the output of each rule into the next one.
    #
    # @return [Object] the value returned by the last rule
    def call
      @rules.inject(@src) do |current, name|
        Rules.const_get(name).new(current).call
      end
    end
  end
end
@@ -0,0 +1,76 @@
1
+ module InciScore
2
+ class Normalizer
3
+ module Rules
4
+ class Base
5
+ SEPARATOR = ','
6
+
7
+ def initialize(src)
8
+ @src = src
9
+ end
10
+
11
+ def call
12
+ fail NotImplementedError
13
+ end
14
+ end
15
+
16
+ class Replacer < Base
17
+ REPLACEMENTS = [
18
+ [/\n+|\t+/, ' '],
19
+ ['‘', "'"],
20
+ ['—', '-'],
21
+ ['(', 'C'],
22
+ ['_', ' '],
23
+ ['~', '-'],
24
+ ['|', 'l'],
25
+ [' I ', '/']
26
+ ]
27
+
28
+ def call
29
+ REPLACEMENTS.reduce(@src) do |src, replacement|
30
+ invalid, valid = *replacement
31
+ src.index(invalid) ? src.gsub(invalid, valid) : src
32
+ end
33
+ end
34
+ end
35
+
36
+ class Downcaser < Base
37
+ def call
38
+ @src.downcase
39
+ end
40
+ end
41
+
42
+ class Beheader < Base
43
+ TITLE_SEP = ':'
44
+ MAX_INDEX = 50
45
+
46
+ def call
47
+ sep_index = @src.index(TITLE_SEP)
48
+ return @src if !sep_index || sep_index > MAX_INDEX
49
+ @src[sep_index+1, @src.size]
50
+ end
51
+ end
52
+
53
+ class Separator < Base
54
+ SEPARATORS = ["; ", ". ", " ' ", " - ", " : "]
55
+
56
+ def call
57
+ SEPARATORS.reduce(@src) do |src, separator|
58
+ src = src.gsub(separator, SEPARATOR)
59
+ end
60
+ end
61
+ end
62
+
63
+ class Tokenizer < Base
64
+ INVALID_CHARS = /[^\w\s-]/
65
+
66
+ def call
67
+ @src.split(SEPARATOR).map do |token|
68
+ token = token.sub(/\/.*/, '')
69
+ token = token.gsub(INVALID_CHARS, '')
70
+ token = token.strip
71
+ end.reject(&:empty?)
72
+ end
73
+ end
74
+ end
75
+ end
76
+ end
@@ -0,0 +1,43 @@
1
+ require 'nokogiri'
2
+
3
module InciScore
  # Extracts the components and their hazard level from the Biodizionario
  # HTML page.
  class Parser
    BIODIZIO_URI = 'http://www.biodizionario.it/biodizio.php'
    SEMAPHORES = %w[vv v g r rr]
    CSS_QUERY = 'table[width="751"] > tr > td img'

    # @param src [String, nil] the HTML document; when omitted the page at
    #   BIODIZIO_URI is downloaded in a background thread
    def initialize(src = nil)
      @src = src || Thread.new { download }
    end

    # Builds, once, the hash mapping each normalized component name to the
    # index of its semaphore within SEMAPHORES.
    #
    # @return [Hash{String => Integer}]
    def call
      @components ||= Nokogiri::HTML(doc).css(CSS_QUERY).each_with_object({}) do |img, acc|
        hazard = semaphore(img.attr('src'))
        name = img.next_sibling.next_sibling
        desc = name.next_sibling.next_sibling
        # When the second node is the all-uppercase one, it is taken as the
        # name.
        name, desc = desc, name if swap?(desc.text)
        acc[normalize(name)] = SEMAPHORES.index(hazard)
      end
    end

    private

    # Fetches the remote page body. Kernel#open only handles URLs when
    # open-uri is loaded (which this file never did) and not at all on
    # Ruby 3+, so Net::HTTP is used instead. It is required lazily because
    # it is needed only when no source is supplied.
    def download
      require 'net/http'
      Net::HTTP.get(URI(BIODIZIO_URI))
    end

    # The document itself, or the value of the thread downloading it.
    def doc
      @src.respond_to?(:value) ? @src.value : @src
    end

    # Extracts the semaphore code from the image path; the dot before the
    # extension is matched literally.
    def semaphore(src)
      src.match(/(#{SEMAPHORES.join('|')})\.gif$/)[1]
    end

    def normalize(node)
      node.text.strip.downcase
    end

    # True for a non-empty text containing no lowercase letters.
    def swap?(desc)
      return false if desc.empty?
      desc == desc.upcase
    end
  end
end
@@ -0,0 +1,27 @@
1
+ require 'inci_score/recognizer_rules'
2
+
3
module InciScore
  # Resolves an ingredient to a catalog component by trying each
  # recognition rule in turn.
  class Recognizer
    # Every rule defined under Rules except the abstract Base, in definition
    # order.
    DEFAULT_RULES = Rules.constants - [:Base]

    # @param src [String] the ingredient to recognize
    # @param catalog [Hash] the catalog to look the ingredient up in
    # @param rules [Array<Symbol>] names of the rules to apply, in order
    def initialize(src, catalog, rules = DEFAULT_RULES)
      @src = src
      @catalog = catalog
      @rules = rules
    end

    # @return [Array, nil] the component together with its catalog value,
    #   or nil when no rule recognizes the ingredient
    def call
      @component = apply_rules
      return unless @component
      [@component, @catalog[@component]]
    end

    private

    # Instantiates every rule, but invokes them only until one matches.
    def apply_rules
      match = nil
      @rules.each do |name|
        rule = Rules.const_get(name).new(@src, @catalog)
        match ||= rule.call
      end
      match
    end
  end
end
@@ -0,0 +1,75 @@
1
+ require 'inci_score/levenshtein'
2
+
3
+ module InciScore
4
+ class Recognizer
5
+ module Rules
6
+ class Base
7
+ TOLERANCE = 3
8
+
9
+ def initialize(src, catalog)
10
+ @src = src
11
+ @catalog = catalog
12
+ end
13
+
14
+ def call
15
+ fail NotmplementedError
16
+ end
17
+ end
18
+
19
+ class Key < Base
20
+ def call
21
+ @src if @catalog.has_key?(@src)
22
+ end
23
+ end
24
+
25
+ class Levenshtein < Base
26
+ ALTERNATE_SEP = '/'
27
+
28
+ def call
29
+ size = @src.size
30
+ initial = @src[0]
31
+ component, distance = @catalog.reduce([nil, size]) do |min, (comp, _)|
32
+ next min unless comp.start_with?(initial)
33
+ match = (n = comp.index(ALTERNATE_SEP)) ? comp[0, n] : comp
34
+ next min if match.size > (size + TOLERANCE)
35
+ dist = @src.distance(match)
36
+ min = [comp, dist] if dist < min[1]
37
+ min
38
+ end
39
+ component unless distance > TOLERANCE || distance >= (size-1)
40
+ end
41
+ end
42
+
43
+ class Digits < Base
44
+ MIN_MEANINGFUL = 7
45
+
46
+ def call
47
+ return if @src.size < TOLERANCE
48
+ digits = @src[0, MIN_MEANINGFUL]
49
+ @catalog.detect do |component, _|
50
+ component.match(/^#{Regexp::escape(digits)}/)
51
+ end.to_a.first
52
+ end
53
+ end
54
+
55
+ class Tokens < Base
56
+ UNMATCHABLE = %w[extract oil sodium acid sulfate]
57
+
58
+ def call
59
+ tokens.each do |token|
60
+ @catalog.each do |component, _|
61
+ return component if component.match(/\b#{Regexp.escape(token)}\b/)
62
+ end
63
+ end
64
+ nil
65
+ end
66
+
67
+ private
68
+
69
+ def tokens
70
+ (@src.split(' ') - UNMATCHABLE).reject { |t| t.size < TOLERANCE }.sort_by!(&:size).reverse!
71
+ end
72
+ end
73
+ end
74
+ end
75
+ end
@@ -0,0 +1,31 @@
1
+ require 'json'
2
+
3
module InciScore
  # Value object carrying the outcome of a computation.
  class Response
    attr_reader :components, :score, :unrecognized, :valid

    # @param options [Hash] :components, :score, :unrecognized and :valid;
    #   each one falls back to an empty/zero/false default when missing
    def initialize(options = {})
      @components = options.fetch(:components) { [] }
      @score = options.fetch(:score) { 0.0 }
      @unrecognized = options.fetch(:unrecognized) { [] }
      @valid = options.fetch(:valid) { false }
    end

    # JSON representation of the response.
    #
    # Accepts (and forwards) the optional generator arguments: the JSON
    # generator passes its state to #to_json when the response is nested in
    # an Array or a Hash, which raised ArgumentError with the former
    # zero-argument signature.
    #
    # @return [String]
    def to_json(*args)
      { components: @components, unrecognized: @unrecognized, score: @score, valid: @valid }.to_json(*args)
    end

    # Human-readable, multi-line representation of the response.
    #
    # @return [String]
    def to_s
      %Q{
TOTAL SCORE:
\t#{@score}
VALID STATE:
\t#{@valid}
COMPONENTS:
#{@components.map { |c| "\t#{c}" }.join("\n")}
UNRECOGNIZED:
#{@unrecognized.map { |c| "\t#{c}" }.join("\n")}
}
    end
  end
end
@@ -0,0 +1,19 @@
1
module InciScore
  # Hazard of a single component once its weight has been subtracted.
  class Score
    attr_reader :value

    # @param hazard [Numeric] the hazard of the component
    # @param weight [Numeric] the amount subtracted from the hazard
    def initialize(hazard, weight)
      @hazard = hazard
      @weight = weight
      @value = compute
    end

    private

    # The weighted hazard, floored at 0.0 when the weight exceeds the hazard.
    def compute
      difference = @hazard - @weight
      difference < 0 ? 0.0 : difference
    end
  end
end
@@ -0,0 +1,45 @@
1
+ require 'inci_score/score'
2
+
3
module InciScore
  # Turns a list of hazards into a single score: 100 minus the average
  # hazard multiplied by HAZARD_PERCENT.
  class Scorer
    HAZARD_PERCENT = 25
    WEIGHT_FACTOR = 5

    # @param hazards [Array<Numeric>, Numeric, nil] the hazards to score
    def initialize(hazards)
      @hazards = Array(hazards)
      @size = @hazards.size
    end

    # @return [Numeric] 0 when there are no hazards, the score otherwise
    def call
      @hazards.empty? ? 0 : 100 - avg * HAZARD_PERCENT
    end

    private

    def avg
      avg_weighted / @size.to_f
    end

    # Plain sum when every hazard is the same, otherwise the sum of the
    # hazards after each has been lowered by its positional weight.
    def avg_weighted
      if same_hazard?
        @hazards.inject(:+)
      else
        weighted.inject(0.0) { |total, score| total + score.value }
      end
    end

    def same_hazard?
      @hazards.uniq.length == 1
    end

    def weighted
      @hazards.map.with_index do |hazard, index|
        Score.new(hazard, weight(index))
      end
    end

    # Logarithm of the 1-based position, in base size * WEIGHT_FACTOR.
    def weight(index)
      Math.log(index + 1, @size * WEIGHT_FACTOR)
    end
  end
end
@@ -0,0 +1,3 @@
1
module InciScore
  # Release number of the gem.
  VERSION = '1.1.0'
end
data/lib/inci_score.rb ADDED
@@ -0,0 +1,4 @@
1
+ require 'inci_score/version'
2
+ require 'inci_score/parser'
3
+ require 'inci_score/catalog'
4
+ require 'inci_score/computer'
data/log/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ # Ignore everything in this directory
2
+ *
3
+ # Except this file
4
+ !.gitignore