food_ingredient_parser 1.0.0.pre.8 → 1.0.0.pre.9

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b8042180a4a4fbc5233a5630c7e0cf8e4751182b
4
- data.tar.gz: 26abeaf528a49a6f01a47eb114e35631d9347cc7
3
+ metadata.gz: f08e6fcc9422b83503d37b41111f3bd540c11909
4
+ data.tar.gz: 16af14258ae67fa9b03b2b2196da9631d3dc6a9d
5
5
  SHA512:
6
- metadata.gz: 6d7c7972846a88046760de7d1c5857f426891502fdca4f50c0fad179f2a7580dd0b157aed106e9efa62af013e4a20d6d8e3be9c49ec8e4eeee326cb228e26c91
7
- data.tar.gz: 872aadc53b40e991e156fde3bb89db69bebb52ca8993aadfbc1f3c862b76b0d42ac01b377da70af22cf6b909b24b063a16d59f1aab4ed6ddd4917e1196736c50
6
+ metadata.gz: b1bfd6c713f0117cc8c13f4110624d270cb471096577e2dc47ea43f6f70664c88c9a2bec444bac407f9424b3c147245e16d04b5b0e9bf8442273c8d90c27955c
7
+ data.tar.gz: 873de15303ea9bebb4ab9d3504851a4b364e1aef849f6c530235476a4d8e65867fe5426dd7b9274ecd99befb2de83746b8854a6ba08a24cb8caac7525973340b
data/README.md CHANGED
@@ -29,7 +29,7 @@ Results in
29
29
  ```ruby
30
30
  {
31
31
  :contains=>[
32
- {:name=>"Water", :amount=>"60%", :mark=>"*"},
32
+ {:name=>"Water", :amount=>"60%", :marks=>["*"]},
33
33
  {:name=>"suiker", :amount=>"30%"},
34
34
  {:name=>"voedingszuren", :contains=>[
35
35
  {:name=>"citroenzuur"}
@@ -165,8 +165,9 @@ Even though the strict parser would not give a result, the loose parser returns:
165
165
  {
166
166
  :contains=>[
167
167
  {:name=>"Saus", :contains=>[
168
- {:name=>"tomaat", :mark=>"*", :amount=>"10%"},
169
- {:contains=>[{:name=>"zout"}]},
168
+ {:name=>"tomaat", :marks=>["*"], :amount=>"10%",
169
+ :contains=>[{:name=>"zout"}
170
+ ]},
170
171
  {:name=>"peper"}
171
172
  ]}
172
173
  ]
@@ -28,7 +28,7 @@ module FoodIngredientParser::Loose
28
28
  def to_h
29
29
  r = {}
30
30
  r[:name] = name.text_value.strip if name && name.text_value.strip != ''
31
- r[:mark] = mark.text_value.strip if mark
31
+ r[:marks] = [mark.text_value.strip] if mark
32
32
  r[:amount] = amount.text_value.strip if amount
33
33
  r[:contains] = contains.map(&:to_h).reject {|c| c == {} } if contains.any?
34
34
  r[:notes] = notes.map{|n| n.text_value.strip }.reject {|c| c == '' } if notes.any?
@@ -1,6 +1,7 @@
1
1
  require_relative '../cleaner'
2
2
  require_relative 'scanner'
3
3
  require_relative 'transform/amount'
4
+ require_relative 'transform/handle_missing_name'
4
5
 
5
6
  module FoodIngredientParser::Loose
6
7
  class Parser
@@ -13,11 +14,13 @@ module FoodIngredientParser::Loose
13
14
  # Parse food ingredient list text into a structured representation.
14
15
  #
15
16
  # @option clean [Boolean] pass +false+ to disable correcting frequently occurring issues
17
+ # @option normalize [Boolean] pass +false+ to disable some normalizations (handling missing names)
16
18
  # @return [FoodIngredientParser::Loose::Node] structured representation of food ingredients
17
- def parse(s, clean: true, **options)
19
+ def parse(s, clean: true, normalize: true, **options)
18
20
  s = FoodIngredientParser::Cleaner.clean(s) if clean
19
21
  n = Scanner.new(s).scan
20
22
  n = Transform::Amount.transform!(n) if n
23
+ n = Transform::HandleMissingName.transform!(n) if n && normalize
21
24
  n
22
25
  end
23
26
  end
@@ -5,8 +5,15 @@ module FoodIngredientParser::Loose
5
5
 
6
6
  SEP_CHARS = "|;,.".freeze
7
7
  MARK_CHARS = "¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº⁽⁾†‡•°#^*".freeze
8
- PREFIX_RE = /\A\s*(ingredients|contains|ingred[iï][eë]nt(en)?(declaratie)?|bevat|dit zit er\s?in|samenstelling|zutaten)\s*[:;.]\s*/i.freeze
8
+ PREFIX_RE = /\A\s*(ingredients|contains|ingred[iï][eë]nt(en)?(declaratie)?|bevat|dit zit er\s?in|samenstelling|zutaten)\b\s*[:;.]?\s*/i.freeze
9
9
  NOTE_RE = /\A\b(dit product kan\b|kan sporen\b.*?\bbevatten\b|voor allergenen\b|allergenen\b|E\s*=|gemaakt in\b|geproduceerd in\b|bevat mogelijk\b|kijk voor meer\b|allergie-info|in de fabriek\b|in dit bedrijf\b)/i.freeze
10
+ # Keep in sync with +abbrev+ in the +Common+ grammar, plus relevant ones from the +Amount+ grammar.
11
+ ABBREV_RE = Regexp.union(%w[
12
+ a.o.p b.g.a b.o.b c.a c.i d.e d.m.v d.o.c d.o.p d.s e.a e.g e.u f.i.l f.o.s i.a
13
+ i.d i.e i.g.m.e i.g.p i.m.v i.o i.v.m l.s.l n.a n.b n.o n.v.t o.a o.b.v p.d.o
14
+ p.g.i q.s s.l s.s t.o.v u.h.t v.g v.s w.a w.o w.v vit denat N° °C
15
+ min max ca
16
+ ].map {|s| /\A#{Regexp.escape(s)}\b\.?/})
10
17
 
11
18
  def initialize(s, index: 0)
12
19
  @s = s # input string
@@ -45,7 +52,10 @@ module FoodIngredientParser::Loose
45
52
  end
46
53
 
47
54
  def scan_iteration_standard
48
- if "([".include?(c) # open nesting
55
+ if (len = abbrev_len) > 0 # defer iterations until after any abbreviation
56
+ cur # reference to record starting position
57
+ @i += len - 1
58
+ elsif "([".include?(c) # open nesting
49
59
  open_parent
50
60
  elsif ")]".include?(c) # close nesting
51
61
  add_child
@@ -57,8 +67,12 @@ module FoodIngredientParser::Loose
57
67
  elsif is_sep? # separator
58
68
  add_child
59
69
  elsif ":".include?(c) # another open nesting
60
- open_parent(auto_close: true)
61
- @iterator = :colon
70
+ if @s[@i+1..-1] =~ /\A\s*(\(|\[)/
71
+ # ignore if before an open bracket, then it's a regular nesting
72
+ else
73
+ open_parent(auto_close: true)
74
+ @iterator = :colon
75
+ end
62
76
  elsif is_mark? && !cur.mark # mark after ingredient
63
77
  name_until_here
64
78
  len = mark_len
@@ -70,7 +84,10 @@ module FoodIngredientParser::Loose
70
84
  end
71
85
 
72
86
  def scan_iteration_colon
73
- if "/".include?(c) # slash separator in colon nesting only
87
+ if (len = abbrev_len) > 0 # defer iterations until after any abbreviation
88
+ cur # reference to record starting position
89
+ @i += len - 1
90
+ elsif "/".include?(c) # slash separator in colon nesting only
74
91
  add_child
75
92
  elsif is_sep? # regular separator indicates end of colon nesting
76
93
  add_child
@@ -108,14 +125,14 @@ module FoodIngredientParser::Loose
108
125
  @cur ||= Node.new(@s, @i)
109
126
  end
110
127
 
111
- def is_mark?
112
- mark_len > 0 && @s[@i..@i+1] !~ /\A°[CF]/
113
- end
114
-
115
128
  def is_sep?(chars: SEP_CHARS)
116
129
  chars.include?(c) && @s[@i-1..@i+1] !~ /\A\d.\d\z/
117
130
  end
118
131
 
132
+ def is_mark?
133
+ mark_len > 0 && @s[@i..@i+1] !~ /\A°[CF]/
134
+ end
135
+
119
136
  def mark_len
120
137
  i = @i
121
138
  while @s[i] && MARK_CHARS.include?(@s[i])
@@ -124,6 +141,11 @@ module FoodIngredientParser::Loose
124
141
  i - @i
125
142
  end
126
143
 
144
+ def abbrev_len
145
+ m = @s[@i .. -1].match(ABBREV_RE)
146
+ m ? m.offset(0).last : 0
147
+ end
148
+
127
149
  def is_notes_start?
128
150
  # @todo use more heuristics: don't assume dot is notes when separator is a dot, and only toplevel?
129
151
  if ( is_mark? && @s[@i+mark_len..-1] =~ /\A\s*=/ ) || # "* = Biologisch"
@@ -0,0 +1,64 @@
1
+ module FoodIngredientParser::Loose
2
+ module Transform
3
+ # Transforms node tree to handle missing names.
4
+ #
5
+ # The loose parser can return a node tree that has some ingredients without a name.
6
+ # Usually this means that either the parser wasn't smart enough to understand the input,
7
+ # or the input was not strictly clear (e.g. a case like "herbs, (oregano), salt" is often seen).
8
+ #
9
+ # When a contained node is found which doesn't have a name:
10
+ # * For the amount (if any): ignore it (as it's often ambiguous which ingredient it belongs to)
11
+ # * For the marks (if any): ignore them (we might instead add them to the containing ingredients)
12
+ # * For the containing ingredients (if any):
13
+ # - if the previous ingredient is present and doesn't contain ingredients already,
14
+ # assume the current contained ingredients are actually part of the previous ingredient.
15
+ # - if there is no previous ingredient, assume the nesting is wrong and insert them before
16
+ # the other ingredients one depth level above.
17
+ # - if there is a previous ingredient which already contains ingredients, we can't make much of it;
18
+ # to avoid losing them, add them as contained ingredients to the previous ingredient.
19
+ #
20
+ class HandleMissingName
21
+ def self.transform!(node)
22
+ new(node).transform!
23
+ end
24
+
25
+ def initialize(node)
26
+ @node = node
27
+ end
28
+
29
+ def transform!
30
+ transform_children!(@node)
31
+ @node
32
+ end
33
+
34
+ private
35
+
36
+ def transform_children!(node)
37
+ prev = nil
38
+ new_contains = []
39
+ node.contains.each do |child|
40
+ # Apply recursively. Do it before processing to handle multiple depth levels of missing names.
41
+ transform_children!(child) if child.contains.any?
42
+
43
+ if child.name.nil? || child.name.text_value.strip == ''
44
+ # Name is empty, we need to do something.
45
+ if prev
46
+ # there is a previous ingredient: move children to new parent
47
+ prev.contains.push(*child.contains)
48
+ else
49
+ # there is no previous ingredient: move children one level up
50
+ new_contains.push(*child.contains)
51
+ end
52
+ else
53
+ # Nothing to see here, just leave it as it is.
54
+ new_contains << child
55
+ end
56
+
57
+ prev = child
58
+ end
59
+
60
+ node.contains = new_contains
61
+ end
62
+ end
63
+ end
64
+ end
@@ -3,29 +3,35 @@ module FoodIngredientParser::Strict::Grammar
3
3
  include Common
4
4
 
5
5
  rule amount
6
- '(' ws* amount:simple_amount ws* ')' <AmountNode> /
7
- '[' ws* amount:simple_amount ws* ']' <AmountNode> /
8
- '{' ws* amount:simple_amount ws* '}' <AmountNode> /
9
- amount:simple_amount <AmountNode>
6
+ '(' ws* amount:amount_simple ws* ')' <AmountNode> /
7
+ '[' ws* amount:amount_simple ws* ']' <AmountNode> /
8
+ '{' ws* amount:amount_simple ws* '}' <AmountNode> /
9
+ amount:amount_simple <AmountNode>
10
10
  end
11
11
 
12
- rule simple_amount
12
+ rule amount_simple
13
13
  ( (
14
14
  'of which'i / 'at least'i / 'minimal'i / 'maximal'i / 'less than'i / 'more than'i /
15
15
  'waarvan'i / 'ten minste'i / 'tenminste'i / 'minimaal'i / 'maximaal'i / 'minder dan'i / 'meer dan'i /
16
- 'min.'i / 'min'i / 'max.'i / 'max'i
16
+ 'min.'i / 'min'i / 'max.'i / 'max'i / 'c.a.'i / 'ca.'i / 'ca'i
17
17
  ) ws* )?
18
- [±∓~∼∽≂≃≈≲≤<>≥≳]? ws*
19
- simple_amount_quantity
18
+ amount_simple_quantity
20
19
  ( ws+ (
21
20
  'minimaal'i / 'minimum'i / 'van het uitlekgewicht'i / 'van het geheel'i /
22
21
  'min.'i / 'min'i / 'max.'i / 'max'i
23
22
  ) )?
24
23
  end
25
24
 
26
- rule simple_amount_quantity
27
- number ( ws* '-' ws* number )? ws* ( [%٪⁒%﹪] / ( ( 'procent' / 'percent' / 'gram'i / 'ml'i / 'mg'i / 'g'i ) !char ) )
25
+ rule amount_simple_quantity
26
+ amount_simple_number ( ws* amount_simple_unit? ws* dash ws* amount_simple_number )? ws* amount_simple_unit
27
+ end
28
+
29
+ rule amount_simple_number
30
+ ( [±∓~∼∽≂≃≈≲≤<>≥≳] ws* )? number
28
31
  end
29
32
 
33
+ rule amount_simple_unit
34
+ ( percent / ( ( 'procent' / 'percent' / 'gram'i / 'ml'i / 'mg'i / 'gr'i / 'g'i ) !char ) )
35
+ end
30
36
  end
31
37
  end
@@ -31,10 +31,18 @@ module FoodIngredientParser::Strict::Grammar
31
31
  [½⅓⅔¼¾⅕⅖⅗⅘⅙⅚⅐⅛⅜⅝⅞⅑⅒]
32
32
  end
33
33
 
34
+ rule percent
35
+ [%٪⁒%﹪]
36
+ end
37
+
34
38
  rule number
35
39
  digit+ [,.] digit+ / digit+ ws* fraction / fraction / digit+
36
40
  end
37
41
 
42
+ rule dash
43
+ [-֊ ‐ ‑ ‒ – — ― ﹘﹣-]
44
+ end
45
+
38
46
  rule word
39
47
  abbrev / char+
40
48
  end
@@ -50,6 +58,9 @@ module FoodIngredientParser::Strict::Grammar
50
58
  # cat data/ingredient-samples-nl | perl -ne '$_=lc($_); /\b(([a-z]\.)+[a-z])\W/ && print "$1\n"' | sort | uniq -c | sort -rn
51
59
  # Finally, you can generate the full list using this command:
52
60
  # cat data/ingredient-samples-nl | perl -ne '$_=lc($_); /\b(([a-z]\.)+[a-z])\W/ && print "$1\n"' | sort | uniq | sed "s/^/'/;s/$/'i \//"
61
+ #
62
+ # Keep this list in sync with {FoodIngredientParser::Loose::Scanner::ABBREV_RE}.
63
+ # too bad we can't use a shared array for this - https://groups.google.com/d/msg/treetop-dev/f3NveVHi7Aw/0uUogmLMb8wJ
53
64
  (
54
65
  'a.o.p'i /
55
66
  'b.g.a'i /
@@ -5,14 +5,14 @@ module FoodIngredientParser::Strict::Grammar
5
5
  include IngredientSimple
6
6
 
7
7
  rule ingredient_nested
8
- ( ing:ingredient_simple ws* '(' contains:ingredient_nested_in ws* ')' ws? mark:mark ws* amount:amount <NestedIngredientNode> ) /
9
- ( ing:ingredient_simple ws* '(' contains:ingredient_nested_in ws* ')' ws* amount:amount <NestedIngredientNode> ) /
10
- ( ing:ingredient_simple_with_amount ws* '(' contains:ingredient_nested_in ws* ')' ws? mark:mark <NestedIngredientNode> ) /
11
- ( ing:ingredient_simple_with_amount ws* '(' contains:ingredient_nested_in ws* ')' <NestedIngredientNode> ) /
12
- ( ing:ingredient_simple ws* '[' contains:ingredient_nested_in ws* ']' ws? mark:mark ws* amount:amount <NestedIngredientNode> ) /
13
- ( ing:ingredient_simple ws* '[' contains:ingredient_nested_in ws* ']' ws* amount:amount <NestedIngredientNode> ) /
14
- ( ing:ingredient_simple_with_amount ws* '[' contains:ingredient_nested_in ws* ']' ws? mark:mark <NestedIngredientNode> ) /
15
- ( ing:ingredient_simple_with_amount ws* '[' contains:ingredient_nested_in ws* ']' <NestedIngredientNode> )
8
+ ( ing:ingredient_simple (ws* ':')? ws* '(' contains:ingredient_nested_in ws* ')' ws? mark:mark ws* amount:amount <NestedIngredientNode> ) /
9
+ ( ing:ingredient_simple (ws* ':')? ws* '(' contains:ingredient_nested_in ws* ')' ws* amount:amount <NestedIngredientNode> ) /
10
+ ( ing:ingredient_simple_with_amount (ws* ':')? ws* '(' contains:ingredient_nested_in ws* ')' ws? mark:mark <NestedIngredientNode> ) /
11
+ ( ing:ingredient_simple_with_amount (ws* ':')? ws* '(' contains:ingredient_nested_in ws* ')' <NestedIngredientNode> ) /
12
+ ( ing:ingredient_simple (ws* ':')? ws* '[' contains:ingredient_nested_in ws* ']' ws? mark:mark ws* amount:amount <NestedIngredientNode> ) /
13
+ ( ing:ingredient_simple (ws* ':')? ws* '[' contains:ingredient_nested_in ws* ']' ws* amount:amount <NestedIngredientNode> ) /
14
+ ( ing:ingredient_simple_with_amount (ws* ':')? ws* '[' contains:ingredient_nested_in ws* ']' ws? mark:mark <NestedIngredientNode> ) /
15
+ ( ing:ingredient_simple_with_amount (ws* ':')? ws* '[' contains:ingredient_nested_in ws* ']' <NestedIngredientNode> )
16
16
  end
17
17
 
18
18
  rule ingredient_nested_in
@@ -5,9 +5,11 @@ module FoodIngredientParser::Strict::Grammar
5
5
  include Ingredient
6
6
 
7
7
  rule list_coloned
8
- contains:( ( ws* list_coloned_ingredient ws* '.' )+ list_coloned_ingredient? ) <ListNode> /
9
- contains:( ( ws* list_coloned_ingredient ws* ';' )+ list_coloned_ingredient? ) <ListNode> /
10
- contains:( ws* list_coloned_ingredient ) <ListNode>
8
+ contains:( ( ws* list_coloned_ingredient ws* '.' )+ ws* list_coloned_ingredient ) <ListNode> /
9
+ contains:( ( ws* list_coloned_ingredient ws* '.' )+ ) <ListNode> /
10
+ contains:( ( ws* list_coloned_ingredient ws* ';' )+ ws* list_coloned_ingredient ) <ListNode> /
11
+ contains:( ( ws* list_coloned_ingredient ws* ';' )+ ) <ListNode> /
12
+ contains:( ws* list_coloned_ingredient ) <ListNode>
11
13
  end
12
14
 
13
15
  rule list_coloned_inner_list
@@ -47,7 +47,7 @@ module FoodIngredientParser::Strict
47
47
  h[:name] = name.text_value if respond_to?(:name)
48
48
  h[:name] = pre.text_value + h[:name] if respond_to?(:pre)
49
49
  h[:name] = h[:name] + post.text_value if respond_to?(:post)
50
- h[:mark] = mark.text_value if respond_to?(:mark) && mark.text_value != ''
50
+ h[:marks] = [mark.text_value] if respond_to?(:mark) && mark.text_value != ''
51
51
  h
52
52
  end
53
53
  end
@@ -1,4 +1,4 @@
1
1
  module FoodIngredientParser
2
- VERSION = '1.0.0.pre.8'
2
+ VERSION = '1.0.0.pre.9'
3
3
  VERSION_DATE = '2018-09-19'
4
4
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: food_ingredient_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0.pre.8
4
+ version: 1.0.0.pre.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - wvengen
@@ -48,6 +48,7 @@ files:
48
48
  - lib/food_ingredient_parser/loose/scanner.rb
49
49
  - lib/food_ingredient_parser/loose/transform/amount.rb
50
50
  - lib/food_ingredient_parser/loose/transform/amount_from_name.treetop
51
+ - lib/food_ingredient_parser/loose/transform/handle_missing_name.rb
51
52
  - lib/food_ingredient_parser/strict/grammar.rb
52
53
  - lib/food_ingredient_parser/strict/grammar/amount.treetop
53
54
  - lib/food_ingredient_parser/strict/grammar/common.treetop