inci_score 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/config.ru ADDED
@@ -0,0 +1,3 @@
1
+ require 'inci_score/api/app'
2
+
3
+ run InciScore::API::App
data/ext/levenshtein.c ADDED
@@ -0,0 +1,43 @@
1
/*
 * Levenshtein (edit) distance between the first len1 bytes of word1 and the
 * first len2 bytes of word2. Comparison is byte-wise and case-sensitive.
 *
 * Only one row of the dynamic-programming table is kept, so the stack usage
 * grows with len2 alone instead of with len1 * len2.
 *
 * Negative lengths are treated as zero.
 */
static int call(const char* word1, int len1, const char* word2, int len2) {
  int cols = len2 > 0 ? len2 : 0;
  int row[cols + 1];
  int i;
  int j;

  /* Row 0: turning the empty prefix of word1 into word2[0..j) costs j inserts. */
  for (j = 0; j <= cols; j++) {
    row[j] = j;
  }

  for (i = 1; i <= len1; i++) {
    char c1 = word1[i - 1];
    /* diagonal holds the previous row's value one column to the left. */
    int diagonal = row[0];

    /* Column 0: turning word1[0..i) into the empty string costs i deletes. */
    row[0] = i;
    for (j = 1; j <= cols; j++) {
      /* row[j] still holds the previous row's value for this column. */
      int above = row[j];

      if (c1 == word2[j - 1]) {
        row[j] = diagonal;
      }
      else {
        int minimum = above;      /* delete */
        if (row[j - 1] < minimum) {
          minimum = row[j - 1];   /* insert */
        }
        if (diagonal < minimum) {
          minimum = diagonal;     /* substitute */
        }
        row[j] = minimum + 1;
      }
      diagonal = above;
    }
  }
  return row[cols];
}
@@ -0,0 +1,28 @@
1
+ lib = File.expand_path('../lib', __FILE__)
2
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
+ require 'inci_score/version'
4
+
5
# Gem specification for inci_score.
#
# The packaged file list comes from `git ls-files`, so the gem must be built
# from inside a git checkout.
Gem::Specification.new do |s|
  s.name = "inci_score"
  s.version = InciScore::VERSION
  s.authors = ["costajob"]
  s.email = ["costajob@gmail.com"]
  s.summary = %q{A library that computes the hazard of cosmetic products components, based on the Biodizionario data.}
  s.homepage = "https://github.com/costajob/inci_score.git"
  s.license = "MIT"
  s.required_ruby_version = ">= 2.0.0"

  # Ship every tracked file except the ones under test-only directories.
  s.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(spec|test|s|features)/}) }
  s.bindir = "exe"
  s.executables = s.files.grep(%r{^exe/}) { |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.add_runtime_dependency "nokogiri", "~> 1.6"
  s.add_runtime_dependency "puma", "~> 3.0"
  # inci_score/api/app requires 'rack' directly, so declare it explicitly
  # rather than relying on another gem to install it.
  s.add_runtime_dependency "rack", ">= 1.6", "< 3.0"
  s.add_runtime_dependency "RubyInline", "~> 3.0"

  s.add_development_dependency "bundler", "~> 1.11"
  s.add_development_dependency "rake", "~> 10.0"
  s.add_development_dependency "minitest", "~> 5.0"
  s.add_development_dependency "rack-test", "~> 0.6"
end
@@ -0,0 +1,21 @@
1
+ require 'rack'
2
+ require 'inci_score'
3
+
4
module InciScore
  module API
    # Rack application exposing the INCI computation as a JSON endpoint.
    #
    # The text to analyse is read from the "src" request parameter. The
    # response status is always '200'; when "src" is missing the body carries
    # an error message instead of a computation result.
    module App
      extend self

      # Catalog handed to every Computer; fetched once and then reused.
      def catalog
        @catalog ||= Catalog.fetch
      end

      # Rack entry point.
      #
      # @param env [Hash] the Rack environment
      # @return [Array] the Rack triplet: status, headers, body
      def call(env)
        source = Rack::Request.new(env).params["src"]
        body =
          if source
            Computer.new(source, catalog).call.to_json
          else
            %q({"error": "no valid source"})
          end
        ['200', {'Content-Type' => 'application/json'}, [body]]
      end
    end
  end
end
@@ -0,0 +1,13 @@
1
+ require 'yaml'
2
+
3
module InciScore
  # Loads the component catalog from its YAML representation.
  module Catalog
    extend self

    # Location of the bundled catalog, resolved relative to this file.
    YAML_PATH = File.expand_path(File.join('..', '..', '..', 'config', 'catalog.yml'), __FILE__)

    # Parses a YAML document into the catalog structure.
    #
    # @param src [String] YAML text; defaults to the content of YAML_PATH
    # @return [Object] whatever the YAML document deserializes to
    #
    # NOTE(review): YAML.load is only meant for trusted documents; do not
    # pass user-supplied text here.
    def fetch(src = File.read(YAML_PATH))
      YAML.load(src)
    end
  end
end
@@ -0,0 +1,45 @@
1
+ require 'inci_score/normalizer'
2
+ require 'inci_score/recognizer'
3
+ require 'inci_score/scorer'
4
+ require 'inci_score/response'
5
+
6
module InciScore
  # Runs the whole pipeline on a raw ingredients text: normalization,
  # recognition against the catalog and scoring, wrapped into a Response.
  class Computer
    # Highest percentage of unrecognized ingredients for a valid result.
    TOLERANCE = 30.0

    # @param src [String] raw ingredients text
    # @param catalog [Hash] component name => hazard
    def initialize(src, catalog)
      @src = src
      @catalog = catalog
      @unrecognized = []
    end

    # @return [Response] the memoized computation result
    def call
      return @response if @response
      # components must be resolved first: recognition is what fills
      # @unrecognized, which both the response and valid? rely on.
      names = components.map(&:first)
      @response = Response.new(components: names,
                               unrecognized: @unrecognized,
                               score: score,
                               valid: valid?)
    end

    private

    def score
      hazards = components.map(&:last)
      Scorer.new(hazards).call
    end

    def ingredients
      @ingredients ||= Normalizer.new(src: @src).call
    end

    # Recognized [name, hazard] pairs; misses are collected in @unrecognized.
    def components
      return @components if @components
      @components = ingredients.each_with_object([]) do |ingredient, recognized|
        component = Recognizer.new(ingredient, @catalog).call
        recognized << component unless component.nil?
        @unrecognized << ingredient unless component
      end
    end

    # True while the share of unrecognized ingredients stays within TOLERANCE.
    def valid?
      unrecognized_percent = @unrecognized.size / (ingredients.size / 100.0)
      unrecognized_percent <= TOLERANCE
    end
  end
end
@@ -0,0 +1,55 @@
1
+ require 'inline'
2
+
3
+ module InciScore
4
# Levenshtein distance backed by the C routine in ext/levenshtein.c, which
# is handed to RubyInline's C builder when this class body is evaluated.
class LevenshteinC
  # Absolute path of the C source, resolved relative to this file.
  C_PROGRAM = File.expand_path('../../../ext/levenshtein.c', __FILE__)

  inline(:C) { |builder| builder.c(File.read(C_PROGRAM)) }
end
11
+
12
# Pure-Ruby Levenshtein distance working on Unicode code points, so that
# multibyte characters count as a single edit. Comparison is case-insensitive.
class Levenshtein
  # @param s [String] first word
  # @param t [String] second word
  def initialize(s, t)
    @s = s.downcase.unpack("U*")
    @t = t.downcase.unpack("U*")
  end

  # @return [Integer] number of single-character edits turning s into t
  def call
    return 0 if @s == @t

    rows = @s.length
    cols = @t.length
    return cols if rows.zero?
    return rows if cols.zero?

    # costs is the previous row of the edit matrix, overwritten in place
    # while the current row is computed.
    costs = (0..cols).to_a
    last = nil

    @s.each_with_index do |source_char, i|
      left = i + 1
      @t.each_with_index do |target_char, j|
        substitution = costs[j] + (source_char == target_char ? 0 : 1)
        last = [costs[j + 1] + 1, left + 1, substitution].min
        costs[j] = left
        left = last
      end
      costs[cols] = last
    end
    last
  end
end
45
+ end
46
+
47
# Convenience edit-distance helpers added to String.
String.class_eval do
  # Case-insensitive Levenshtein distance computed on Unicode code points
  # (pure Ruby implementation).
  #
  # @param t [String] the string to compare with
  # @return [Integer]
  def distance_utf8(t)
    InciScore::Levenshtein.new(self, t).call
  end

  # Case-insensitive Levenshtein distance computed by the C implementation.
  #
  # The C routine walks the strings byte by byte, so it must be given byte
  # lengths: passing character counts would cut off multibyte strings. The
  # lengths are taken after downcasing, from the exact strings handed over.
  #
  # @param t [String] the string to compare with
  # @return [Integer] byte-wise edit distance
  def distance(t)
    source = downcase
    target = t.downcase
    InciScore::LevenshteinC.new.call(source, source.bytesize, target, target.bytesize)
  end
end
@@ -0,0 +1,21 @@
1
+ require 'inci_score/normalizer_rules'
2
+
3
module InciScore
  # Applies a chain of normalization rules to a raw ingredients text.
  class Normalizer
    # Every rule defined under Rules except the abstract Base.
    DEFAULT_RULES = Rules.constants - [:Base]

    attr_reader :src

    # @param options [Hash] :src is mandatory; :rules is an optional list of
    #   rule constant names, defaulting to DEFAULT_RULES
    # @raise [ArgumentError] when :src is missing
    def initialize(options = {})
      source = options[:src]
      fail(ArgumentError, 'missing src') unless source
      @src = source
      @rules = options.key?(:rules) ? options[:rules] : DEFAULT_RULES
    end

    # Feeds the source through each rule in turn, every rule receiving the
    # output of the previous one.
    #
    # @return [Object] the output of the last rule
    def call
      @rules.inject(@src) do |current, name|
        Rules.const_get(name).new(current).call
      end
    end
  end
end
@@ -0,0 +1,76 @@
1
module InciScore
  class Normalizer
    # Single-purpose normalization steps. Each rule wraps a source value and
    # returns its transformed version from #call.
    module Rules
      # Common constructor; subclasses must implement #call.
      class Base
        SEPARATOR = ','

        def initialize(src)
          @src = src
        end

        def call
          fail NotImplementedError
        end
      end

      # Replaces a fixed list of characters and sequences with substitutes.
      class Replacer < Base
        REPLACEMENTS = [
          [/\n+|\t+/, ' '],
          ['‘', "'"],
          ['—', '-'],
          ['(', 'C'],
          ['_', ' '],
          ['~', '-'],
          ['|', 'l'],
          [' I ', '/']
        ]

        def call
          text = @src
          REPLACEMENTS.each do |invalid, valid|
            text = text.gsub(invalid, valid) if text.index(invalid)
          end
          text
        end
      end

      # Lowercases the whole text.
      class Downcaser < Base
        def call
          @src.downcase
        end
      end

      # Drops a leading title: everything up to the first TITLE_SEP, provided
      # the separator occurs within the first MAX_INDEX characters.
      class Beheader < Base
        TITLE_SEP = ':'
        MAX_INDEX = 50

        def call
          position = @src.index(TITLE_SEP)
          return @src if position.nil? || position > MAX_INDEX
          @src[(position + 1)..-1]
        end
      end

      # Turns the alternative separators into the canonical SEPARATOR.
      class Separator < Base
        SEPARATORS = ["; ", ". ", " ' ", " - ", " : "]

        def call
          SEPARATORS.inject(@src) do |text, separator|
            text.gsub(separator, SEPARATOR)
          end
        end
      end

      # Splits the text on SEPARATOR and cleans every token: anything from
      # the first slash on is dropped, invalid characters are removed and
      # blank tokens are discarded.
      class Tokenizer < Base
        INVALID_CHARS = /[^\w\s-]/

        def call
          tokens = @src.split(SEPARATOR).map do |raw|
            raw.sub(/\/.*/, '').gsub(INVALID_CHARS, '').strip
          end
          tokens.reject(&:empty?)
        end
      end
    end
  end
end
@@ -0,0 +1,43 @@
1
+ require 'nokogiri'
2
+
3
# open-uri is what lets a URI object be opened like a file; it is required
# here because the default source below depends on it.
require 'open-uri'

module InciScore
  # Extracts the component => hazard table from the Biodizionario HTML page.
  class Parser
    BIODIZIO_URI = 'http://www.biodizionario.it/biodizio.php'
    # Semaphore image names; the position in this list is the hazard value.
    SEMAPHORES = %w[vv v g r rr]
    CSS_QUERY = 'table[width="751"] > tr > td img'
    # Captures the semaphore name from an image path ending in "<name>.gif".
    SEMAPHORE_IMG = /(#{SEMAPHORES.join('|')})\.gif$/

    # @param src [String, IO, nil] HTML to parse; when nil the page is
    #   downloaded from BIODIZIO_URI in a background thread
    def initialize(src = nil)
      # URI#open (from open-uri) is used instead of Kernel#open, which only
      # handles file paths unless open-uri patches it and no longer accepts
      # URLs at all on recent Ruby versions.
      @src = src || Thread.new { URI.parse(BIODIZIO_URI).open }
    end

    # @return [Hash{String => Integer}] lowercase component name => hazard
    #   index, memoized after the first call
    def call
      @components ||= Nokogiri::HTML(doc).css(CSS_QUERY).each_with_object({}) do |img, acc|
        hazard = semaphore(img.attr('src'))
        name = img.next_sibling.next_sibling
        desc = name.next_sibling.next_sibling
        # An all-uppercase description means the two nodes are swapped.
        name, desc = desc, name if swap?(desc.text)
        acc[normalize(name)] = SEMAPHORES.index(hazard)
      end
    end

    private

    # The source itself, or the value of the download thread (joining it).
    def doc
      @src.respond_to?(:value) ? @src.value : @src
    end

    def semaphore(src)
      src.match(SEMAPHORE_IMG)[1]
    end

    def normalize(node)
      node.text.strip.downcase
    end

    def swap?(desc)
      return false if desc.empty?
      desc == desc.upcase
    end
  end
end
@@ -0,0 +1,27 @@
1
+ require 'inci_score/recognizer_rules'
2
+
3
module InciScore
  # Looks an ingredient up in the catalog by trying the recognition rules in
  # order and keeping the first match.
  class Recognizer
    # Every rule defined under Rules except the abstract Base.
    DEFAULT_RULES = Rules.constants - [:Base]

    # @param src [String] normalized ingredient name
    # @param catalog [Hash] component name => hazard
    # @param rules [Array<Symbol>] rule constant names, tried in order
    def initialize(src, catalog, rules = DEFAULT_RULES)
      @src = src
      @catalog = catalog
      @rules = rules
    end

    # @return [Array, nil] [component, hazard] when a rule matched, else nil
    def call
      @component = apply_rules
      return unless @component
      [@component, @catalog[@component]]
    end

    private

    # Every rule is instantiated, but a rule is only run while no previous
    # one has produced a component.
    def apply_rules
      found = nil
      @rules.each do |name|
        rule = Rules.const_get(name).new(@src, @catalog)
        found ||= rule.call
      end
      found
    end
  end
end
@@ -0,0 +1,75 @@
1
+ require 'inci_score/levenshtein'
2
+
3
module InciScore
  class Recognizer
    # Matching strategies used to map an ingredient onto a catalog component.
    # Each rule returns the matching component name from #call, or nil.
    module Rules
      # Common constructor; subclasses must implement #call.
      class Base
        TOLERANCE = 3

        # @param src [String] normalized ingredient name
        # @param catalog [Hash] component name => hazard
        def initialize(src, catalog)
          @src = src
          @catalog = catalog
        end

        # @raise [NotImplementedError] always
        def call
          fail NotImplementedError
        end
      end

      # Exact match on the catalog key.
      class Key < Base
        def call
          @src if @catalog.has_key?(@src)
        end
      end

      # Closest catalog entry by edit distance, limited to entries sharing
      # the ingredient's first character.
      class Levenshtein < Base
        ALTERNATE_SEP = '/'

        def call
          # Without a first character there is nothing to compare against.
          return if @src.empty?

          size = @src.size
          initial = @src[0]
          component, distance = @catalog.reduce([nil, size]) do |min, (comp, _)|
            next min unless comp.start_with?(initial)
            # Only the part before the alternate-name separator is compared.
            match = (n = comp.index(ALTERNATE_SEP)) ? comp[0, n] : comp
            next min if match.size > (size + TOLERANCE)
            dist = @src.distance(match)
            dist < min[1] ? [comp, dist] : min
          end
          # Reject matches that are too far, or that would rewrite (almost)
          # the whole ingredient.
          component unless distance > TOLERANCE || distance >= (size - 1)
        end
      end

      # First catalog entry starting with the ingredient's leading characters.
      class Digits < Base
        MIN_MEANINGFUL = 7

        def call
          return if @src.size < TOLERANCE
          prefix = /^#{Regexp.escape(@src[0, MIN_MEANINGFUL])}/
          @catalog.detect { |component, _| component.match(prefix) }.to_a.first
        end
      end

      # First catalog entry containing one of the ingredient's words, longest
      # word first.
      class Tokens < Base
        UNMATCHABLE = %w[extract oil sodium acid sulfate]

        def call
          tokens.each do |token|
            pattern = /\b#{Regexp.escape(token)}\b/
            @catalog.each do |component, _|
              return component if component.match(pattern)
            end
          end
          nil
        end

        private

        # Words of the ingredient, minus the UNMATCHABLE ones and those
        # shorter than TOLERANCE, sorted by decreasing length.
        def tokens
          (@src.split(' ') - UNMATCHABLE).reject { |t| t.size < TOLERANCE }.sort_by!(&:size).reverse!
        end
      end
    end
  end
end
@@ -0,0 +1,31 @@
1
+ require 'json'
2
+
3
module InciScore
  # Value object holding the outcome of a computation.
  class Response
    attr_reader :components, :score, :unrecognized, :valid

    # @param options [Hash] :components, :score, :unrecognized and :valid;
    #   each one falls back to an empty/zero/false default when missing
    def initialize(options = {})
      @components = options.fetch(:components) { [] }
      @score = options.fetch(:score) { 0.0 }
      @unrecognized = options.fetch(:unrecognized) { [] }
      @valid = options.fetch(:valid) { false }
    end

    # JSON representation of the response.
    #
    # Accepts and forwards the optional generator arguments, so the response
    # can also be serialized when nested in an Array or Hash or passed to
    # JSON.generate, which call to_json with a state argument.
    #
    # @return [String]
    def to_json(*args)
      { components: @components, unrecognized: @unrecognized, score: @score, valid: @valid }.to_json(*args)
    end

    # Human-readable, multi-line report.
    #
    # The literal is kept flush-left on purpose: any indentation inside it
    # would become part of the returned string.
    def to_s
      %Q{
TOTAL SCORE:
\t#{@score}
VALID STATE:
\t#{@valid}
COMPONENTS:
#{@components.map { |c| "\t#{c}" }.join("\n")}
UNRECOGNIZED:
#{@unrecognized.map { |c| "\t#{c}" }.join("\n")}
}
    end
  end
end
@@ -0,0 +1,19 @@
1
module InciScore
  # Hazard of a single component once its weight has been subtracted.
  class Score
    attr_reader :value

    # @param hazard [Numeric] raw hazard of the component
    # @param weight [Numeric] amount subtracted from the hazard
    def initialize(hazard, weight)
      @hazard = hazard
      @weight = weight
      @value = compute
    end

    private

    # Weighted hazard, floored at 0.0 when the weight exceeds the hazard.
    def compute
      difference = @hazard - @weight
      difference < 0 ? 0.0 : difference
    end
  end
end
@@ -0,0 +1,45 @@
1
+ require 'inci_score/score'
2
+
3
module InciScore
  # Turns a list of component hazards into a single score, where 100 means
  # no hazard at all.
  class Scorer
    # Points taken off the score per unit of average hazard.
    HAZARD_PERCENT = 25
    # Multiplier applied to the list size to get the logarithm base used for
    # the position weights.
    WEIGHT_FACTOR = 5

    # @param hazards [Array<Numeric>, Numeric, nil] hazards in ingredient order
    def initialize(hazards)
      @hazards = Array(hazards)
      @size = @hazards.size
    end

    # @return [Numeric] 0 for an empty list, otherwise 100 minus the average
    #   hazard scaled by HAZARD_PERCENT
    def call
      @hazards.empty? ? 0 : 100 - avg * HAZARD_PERCENT
    end

    private

    def avg
      avg_weighted / @size.to_f
    end

    # Plain sum when every hazard is the same, otherwise the sum of the
    # position-weighted scores.
    def avg_weighted
      if same_hazard?
        @hazards.reduce(&:+)
      else
        weighted.map(&:value).reduce(0.0) { |total, value| total + value }
      end
    end

    def same_hazard?
      @hazards.uniq.length == 1
    end

    def weighted
      @hazards.map.with_index do |hazard, index|
        Score.new(hazard, weight(index))
      end
    end

    # Logarithmic weight: 0 for the first position, growing with the index.
    def weight(index)
      Math.log(index + 1, @size * WEIGHT_FACTOR)
    end
  end
end
@@ -0,0 +1,3 @@
1
module InciScore
  # Version of the gem, read by the gemspec.
  VERSION = '1.1.0'
end
data/lib/inci_score.rb ADDED
@@ -0,0 +1,4 @@
1
+ require 'inci_score/version'
2
+ require 'inci_score/parser'
3
+ require 'inci_score/catalog'
4
+ require 'inci_score/computer'
data/log/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ # Ignore everything in this directory
2
+ *
3
+ # Except this file
4
+ !.gitignore