food_ingredient_parser 1.1.10 → 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a56d22b7e67a3a913b051bcbda8da885ddd467dc53f5a0df0faa5b40759a1f35
4
- data.tar.gz: 427dd79c9f9203dc7901ead6264e08c05183d02aec266ac1d3bff930a5ba1dcd
3
+ metadata.gz: deb4cb55b3d5c41f02171e031fd11cc996cf2e8df9f074aa163efdff58baa6b0
4
+ data.tar.gz: 63dc1b52a15e6f70114cca9ed5d8a585a1f475d70131e4852310cf8755558dca
5
5
  SHA512:
6
- metadata.gz: 0b07032ade3a55ce208bcb0c069223b41aee21f185a2b6a9bb91332881dfef8e1d829ae966097e48ffdba9984517be43b10bd027099f9bdce04e3a4c6fc41ca8
7
- data.tar.gz: ebdf452a09d54b151ce8cfa9bb65b4477dd1afc81bfc5cd1d94055f726d387f522dd04a3e11b73d4b26222a18bc1068912a81c8e2e3cd8439b0cee1c1ec290d7
6
+ metadata.gz: 1cebae488578f1e00f8d905f34d39cef653cdcb4922d26878687afb3463ae3c24ca6592a6f19ac482b5a2f08e95feef342c9631da69acb99feea4e1a81269057
7
+ data.tar.gz: a7c8b98a5c3fd3aee8962e8f31cd9a0ede791e8d7c7193bfbb1ba2524be057bd7b38499973cd6ab6fea58020161ba6f8615dc5eef2191a87d45afec4662fa264
data/README.md CHANGED
@@ -104,7 +104,7 @@ RootNode+Root3 offset=0, "tomato" (contains,notes):
104
104
  SyntaxNode offset=6, ""
105
105
  {:contains=>[{:name=>"tomato"}]}
106
106
 
107
- $ bin/food_ingredient_parser --html -s "tomato"
107
+ $ food_ingredient_parser --html -s "tomato"
108
108
  <div class="root"><span class='depth0'><span class='name'>tomato</span></span></div>
109
109
 
110
110
  $ food_ingredient_parser -v -r loose -s "tomato"
@@ -197,7 +197,7 @@ plus a bit of English and German. Support for other languages is already good, b
197
197
  areas: improvements are welcome (starting with a corpus in [data/](data/)).
198
198
 
199
199
  Many ingredient lists from the USA are structured a bit differently than those from Europe, they
200
- parse less well (that that's a matter of tine-tuning).
200
+ parse less well (that is probably a matter of fine-tuning).
201
201
 
202
202
  ## Test data
203
203
 
@@ -5,7 +5,7 @@ module FoodIngredientParser::Loose
5
5
  class Node
6
6
  include ToHtml
7
7
 
8
- attr_accessor :name, :mark, :amount, :contains, :notes
8
+ attr_accessor :name_parts, :mark, :amount, :contains, :notes
9
9
  attr_reader :input, :interval, :auto_close
10
10
 
11
11
  def initialize(input, interval, auto_close: false)
@@ -14,7 +14,8 @@ module FoodIngredientParser::Loose
14
14
  @auto_close = auto_close
15
15
  @contains = []
16
16
  @notes = []
17
- @name = @mark = @amount = nil
17
+ @name_parts = []
18
+ @mark = @amount = nil
18
19
  end
19
20
 
20
21
  def ends(index)
@@ -31,7 +32,8 @@ module FoodIngredientParser::Loose
31
32
 
32
33
  def to_h
33
34
  r = {}
34
- r[:name] = name.text_value.strip if name && name.text_value.strip != ''
35
+ _name = name
36
+ r[:name] = _name if _name
35
37
  r[:marks] = [mark.text_value.strip] if mark
36
38
  r[:amount] = amount.text_value.strip if amount
37
39
  r[:contains] = contains.map(&:to_h).reject {|c| c == {} } if contains.any?
@@ -39,6 +41,11 @@ module FoodIngredientParser::Loose
39
41
  r
40
42
  end
41
43
 
44
+ def name
45
+ strings = name_parts.map {|n| n.text_value.strip }.reject {|n| n == nil || n == '' }
46
+ return strings.any? ? strings.join(" ") : nil
47
+ end
48
+
42
49
  def inspect(indent="", variant="")
43
50
  inspect_self(indent, variant) +
44
51
  inspect_children(indent)
@@ -47,7 +54,7 @@ module FoodIngredientParser::Loose
47
54
  def inspect_self(indent="", variant="")
48
55
  [
49
56
  indent + "Node#{variant} interval=#{@interval}",
50
- name ? "name=#{name.text_value.strip.inspect}" : nil,
57
+ name ? "name=#{name.inspect}" : nil,
51
58
  mark ? "mark=#{mark.text_value.strip.inspect}" : nil,
52
59
  amount ? "amount=#{amount.text_value.strip.inspect}" : nil,
53
60
  auto_close ? "auto_close" : nil
@@ -4,8 +4,9 @@ module FoodIngredientParser::Loose
4
4
  class Scanner
5
5
 
6
6
  SEP_CHARS = "|;,.".freeze
7
+ AND_SEP_RE = /\A\s*(and|en|und)\s+/i.freeze
7
8
  MARK_CHARS = "¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº⁽⁾†‡⁺•°▪◊#^˄*~".freeze
8
- PREFIX_RE = /\A\s*(ingredients|contains|ingred[iï][eë]nt(en)?(declaratie)?|bevat|dit zit er\s?in|samenstelling|zutaten)\b\s*[:;.]?\s*/i.freeze
9
+ PREFIX_RE = /\A\s*(ingredients(\s*list)?|contains|ingred[iï][eë]nt(en)?(declaratie)?|bevat|dit zit er\s?in|samenstelling|zutaten)\b\s*[:;.]?\s*/i.freeze
9
10
  NOTE_RE = /\A\b(dit product kan\b|deze verpakking kan\b|kan sporen\b.*?\bbevatten\b|voor allergenen\b|allergenen\b|allergie[- ]informatie(\s*:|\b)|E\s*=|gemaakt in\b|geproduceerd in\b|bevat mogelijk\b|kijk voor meer\b|allergie-info|in de fabriek\b|in dit bedrijf\b|voor [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bis [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bgebruikt\b)/i.freeze
10
11
  # Keep in sync with +abbrev+ in the +Common+ grammar, plus relevant ones from the +Amount+ grammar.
11
12
  ABBREV_RE = Regexp.union(
@@ -23,8 +24,8 @@ module FoodIngredientParser::Loose
23
24
  www\.[-_\/:%.A-Za-z0-9]+
24
25
  )/xi,
25
26
  *%w[
26
- a.o.p b.g.a b.o.b c.a c.i d.e d.m.v d.o.c d.o.p d.s e.a e.g e.u f.i.l f.o.s i.a
27
- i.d i.e i.g.m.e i.g.p i.m.v i.o i.v.m l.s.l n.a n.b n.o n.v.t o.a o.b.v p.d.o
27
+ a.o.p b.g.a b.o.b c.a c.i d.e d.m.v d.o.c d.o.p d.s e.a e.g e.u f.i.l f.o.s h.o.h
28
+ i.a i.d i.e i.g.m.e i.g.p i.m.v i.o i.v.m l.s.l n.a n.b n.o n.v.t o.a o.b.v p.d.o
28
29
  p.g.i q.s s.l s.s t.o.v u.h.t v.g v.s w.a w.o w.v vit denat alc vol conc subsp
29
30
  min max ca
30
31
  ].map {|s| /\A#{Regexp.escape(s)}\b\.?/i}
@@ -32,8 +33,9 @@ module FoodIngredientParser::Loose
32
33
 
33
34
  def initialize(s, index: 0)
34
35
  @s = s # input string
35
- @i = index # current index in string
36
+ @i = index # current index in string, the iterator looks at this character
36
37
  @cur = nil # current node we're populating
38
+ @curifree = nil # last index in string for current node that we haven't added to a child node yet
37
39
  @ancestors = [Node.new(@s, @i)] # nesting hierarchy
38
40
  @iterator = :beginning # scan_iteration_<iterator> to use for parsing
39
41
  @dest = :contains # append current node to this attribute on parent
@@ -75,6 +77,12 @@ module FoodIngredientParser::Loose
75
77
  elsif ")]".include?(c) # close nesting
76
78
  add_child
77
79
  close_parent
80
+ # after bracket check for 'and' to not lose text
81
+ if is_and_sep?(@i+1)
82
+ @i += and_sep_len(@i+1)
83
+ @curifree = @i # don't include 'and' in cur name
84
+ add_child
85
+ end
78
86
  elsif is_notes_start? # usually a dot marks the start of notes
79
87
  close_all_ancestors
80
88
  @iterator = :notes
@@ -141,13 +149,26 @@ module FoodIngredientParser::Loose
141
149
  end
142
150
 
143
151
  def cur
144
- @cur ||= Node.new(@s, @i)
152
+ if !@cur
153
+ @cur ||= Node.new(@s, @i)
154
+ @curifree = @i
155
+ end
156
+ @cur
145
157
  end
146
158
 
147
159
  def is_sep?(chars: SEP_CHARS)
148
160
  chars.include?(c) && @s[@i-1..@i+1] !~ /\A\d.\d\z/
149
161
  end
150
162
 
163
+ def is_and_sep?(i = @i)
164
+ and_sep_len(i) > 0
165
+ end
166
+
167
+ def and_sep_len(i = @i)
168
+ m = @s[i..-1].match(AND_SEP_RE)
169
+ m ? m.offset(0).last : 0
170
+ end
171
+
151
172
  def is_mark?(i = @i)
152
173
  mark_len(i) > 0 && @s[i..i+1] !~ /\A°[CF]/
153
174
  end
@@ -186,16 +207,19 @@ module FoodIngredientParser::Loose
186
207
  cur.ends(@i-1)
187
208
  parent.send(@dest) << cur
188
209
  @cur = nil
210
+ @curifree = nil
189
211
  end
190
212
 
191
213
  def open_parent(**options)
192
214
  name_until_here
193
215
  @ancestors << cur
194
216
  @cur = Node.new(@s, @i + 1, **options)
217
+ @curifree = @i + 1
195
218
  end
196
219
 
197
220
  def close_parent
198
221
  return unless @ancestors.count > 1
222
+ @curifree = @i + 1
199
223
  @cur = @ancestors.pop
200
224
  while @cur.auto_close
201
225
  add_child
@@ -212,15 +236,15 @@ module FoodIngredientParser::Loose
212
236
  end
213
237
 
214
238
  def name_until_here
215
- cur.name ||= begin
216
- i, j = cur.interval.first, @i - 1
217
- i += mark_len(i) # skip any mark in front
218
- # Set name if there is any. There is one corner-case that needs to be avoided when
219
- # a nesting was opened without a name, which would set the name to the nesting text.
220
- # In this case, the name starts with an open-nesting symbol, which should never happen.
221
- if j >= i && !"([:".include?(@s[i])
222
- Node.new(@s, i .. j)
223
- end
239
+ return unless @curifree # no cur started yet
240
+ i, j = @curifree, @i - 1
241
+ i += mark_len(i) # skip any mark in front
242
+ # Set name if there is any. There is one corner-case that needs to be avoided when
243
+ # a nesting was opened without a name, which would set the name to the nesting text.
244
+ # In this case, the name starts with an open-nesting symbol, which should never happen.
245
+ if j >= i && !"([:".include?(@s[i])
246
+ cur.name_parts << Node.new(@s, i .. j)
247
+ @curifree = @i
224
248
  end
225
249
  end
226
250
 
@@ -29,18 +29,26 @@ module FoodIngredientParser::Loose
29
29
 
30
30
  # Extract amount from name, if any.
31
31
  def transform_name(node = @node)
32
- if !node.amount && parsed = parse_amount(node.name&.text_value)
33
- offset = node.name.interval.first
32
+ if !node.amount
33
+ node.name_parts.each_with_index do |name, i|
34
+ parsed = parse_amount(name.text_value)
35
+ next unless parsed
36
+ offset = name.interval.first
34
37
 
35
- amount = parsed.amount.amount
36
- node.amount = Node.new(node.input, offset + amount.interval.first .. offset + amount.interval.last - 1)
38
+ amount = parsed.amount.amount
39
+ node.amount = Node.new(node.input, offset + amount.interval.first .. offset + amount.interval.last - 1)
37
40
 
38
- name = parsed.respond_to?(:name) && parsed.name
39
- if name && name.interval.count > 0
40
- node.name = Node.new(node.input, offset + name.interval.first .. offset + name.interval.last - 1)
41
- else
42
- node.name = nil
41
+ name = parsed.respond_to?(:name) && parsed.name
42
+ node.name_parts[i] = if name && name.interval.count > 0
43
+ Node.new(node.input, offset + name.interval.first .. offset + name.interval.last - 1)
44
+ else
45
+ nil
46
+ end
47
+ # found an amount, stop looking in other parts
48
+ break
43
49
  end
50
+ # remove cleared name parts
51
+ node.name_parts.reject!(&:nil?)
44
52
  end
45
53
 
46
54
  # recursively transform contained nodes
@@ -42,7 +42,8 @@ module FoodIngredientParser::Loose
42
42
  # Apply recursively. Do it before processing to handle multiple depth levels of missing names.
43
43
  transform_children!(child) if child.contains.any?
44
44
 
45
- if child.name.nil? || child.name.text_value.strip == ''
45
+ name = child.name
46
+ if name.nil? || name == ''
46
47
  # Name is empty, we need to do something.
47
48
  if prev
48
49
  # there is a previous ingredient: move children to new parent
@@ -29,21 +29,27 @@ module FoodIngredientParser::Loose
29
29
  def transform_node!(node)
30
30
  if node.contains.any?
31
31
  node.contains.each {|n| transform_node!(n) }
32
- elsif node.name && m = MATCH_RE.match(node.name.text_value)
33
- i = 0
34
- while m = node.name.text_value.match(SPLIT_RE, i)
35
- node.contains << new_node(node, i, m.begin(0)-1)
36
- i = m.end(0)
32
+ else
33
+ node.name_parts.each_with_index do |name, name_index|
34
+ if m = MATCH_RE.match(name.text_value)
35
+ i = 0
36
+ while m = name.text_value.match(SPLIT_RE, i)
37
+ node.contains << new_node(name, i, m.begin(0)-1)
38
+ i = m.end(0)
39
+ end
40
+ node.contains << new_node(name, i, name.interval.last) if i <= name.interval.last
41
+ node.name_parts[name_index] = nil
42
+ end
37
43
  end
38
- node.contains << new_node(node, i, node.name.interval.last) if i <= node.name.interval.last
39
- node.name = nil
44
+ # remove cleared name parts
45
+ node.name_parts.reject!(&:nil?)
40
46
  end
41
47
  end
42
48
 
43
- def new_node(node, begins, ends)
44
- offset = node.name.interval.first
45
- new_node = Node.new(node.input, offset + begins .. offset + ends)
46
- new_node.name = Node.new(node.input, new_node.interval)
49
+ def new_node(name, begins, ends)
50
+ offset = name.interval.first
51
+ new_node = Node.new(name.input, offset + begins .. offset + ends)
52
+ new_node.name_parts = [Node.new(name.input, new_node.interval)]
47
53
  new_node
48
54
  end
49
55
  end
@@ -102,6 +102,7 @@ module FoodIngredientParser::Strict::Grammar
102
102
  'e.u'i /
103
103
  'f.i.l'i /
104
104
  'f.o.s'i /
105
+ 'h.o.h'i /
105
106
  'i.a'i /
106
107
  'i.d'i /
107
108
  'i.e'i /
@@ -5,7 +5,12 @@ module FoodIngredientParser::Strict::Grammar
5
5
  include IngredientColoned
6
6
 
7
7
  rule ingredient
8
- ws* ( ingredient_nested / ingredient_coloned / ingredient_simple_with_amount )
8
+ ws*
9
+ (
10
+ ingredient_nested ( ws* and ws+ ingredient )? /
11
+ ingredient_coloned /
12
+ ingredient_simple_with_amount
13
+ )
9
14
  end
10
15
 
11
16
  end
@@ -4,11 +4,13 @@ module FoodIngredientParser::Strict::Grammar
4
4
  include Ingredient
5
5
 
6
6
  rule list
7
- contains:(ingredient ( ws* '|' ws* ingredient )+ ( ws+ and ws+ ingredient )? ) <ListNode> /
8
- contains:(ingredient ( ws* ';' ws* ingredient )+ ( ws+ and ws+ ingredient )? ) <ListNode> /
9
- contains:(ingredient ( ws* ',' ws* ingredient )+ ( ws+ and ws+ ingredient )? ) <ListNode> /
10
- contains:(ingredient ( ws* '.' ws* ingredient )+ ( ws+ and ws+ ingredient )? ) <ListNode> /
11
- contains:(ingredient ( ws+ and ws+ ingredient )? ) <ListNode>
7
+ contains:(ingredient ( ws* '|' ws* ingredient )+ ( ws+ and ws+ ingredient )? ) <ListNode> /
8
+ contains:(ingredient ( ws* ';' ws* ingredient )+ ( ws+ and ws+ ingredient )? ) <ListNode> /
9
+ contains:(ingredient ( ws* ',' ws* ingredient )+ ( ws+ and ws+ ingredient )? ) <ListNode> /
10
+ contains:(ingredient ( ws* '.' ws* ingredient )+ ( ws+ and ws+ ingredient )? ) <ListNode> /
11
+ contains:(ingredient_simple_e_number ( ws* '/' ws* ingredient_simple_e_number )+ ) <ListNode> /
12
+ contains:(ingredient_simple_e_number ( ws* dash ws* ingredient_simple_e_number )+ ) <ListNode> /
13
+ contains:(ingredient ( ws+ and ws+ ingredient )? ) <ListNode>
12
14
  end
13
15
  end
14
16
  end
@@ -17,6 +17,7 @@ module FoodIngredientParser::Strict::Grammar
17
17
  end
18
18
 
19
19
  rule list_coloned_inner_list
20
+ contains:( ingredient_simple_e_number ( ws* '/' ws* ingredient_simple_e_number )+ ) <ListNode> /
20
21
  contains:( ingredient_simple_e_number ( ws* dash ws* ingredient_simple_e_number )+ ) <ListNode> /
21
22
  contains:( ingredient ( ws* ',' ws* ingredient )* ) <ListNode>
22
23
  end
@@ -19,9 +19,10 @@ module FoodIngredientParser::Strict::Grammar
19
19
 
20
20
  rule root_prefix
21
21
  (
22
- 'ingredients'i / 'contains'i /
22
+ 'ingredients'i ( ws+ 'list'i )? / 'contains'i /
23
23
  ('ingred'i [IÏiï] [EËeë] 'n'i ( 't'i 'en'i? 'declaratie'i? )? ) / 'bevat'i / 'dit zit er in'i / 'samenstelling'i /
24
- 'zutaten'i
24
+ 'zutaten'i /
25
+ 'ingredienser'i
25
26
  )
26
27
  ( ws* [:;.] ( ws* newline )? / ws* newline / ws ) ws* # optional colon or other separator
27
28
  "'"? ws* # stray quote occurs sometimes
@@ -1,4 +1,4 @@
1
1
  module FoodIngredientParser
2
- VERSION = '1.1.10'
3
- VERSION_DATE = '2021-03-23'
2
+ VERSION = '1.3.0'
3
+ VERSION_DATE = '2024-06-13'
4
4
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: food_ingredient_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.10
4
+ version: 1.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - wvengen
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-03-23 00:00:00.000000000 Z
11
+ date: 2024-06-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: treetop
@@ -87,7 +87,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
87
87
  - !ruby/object:Gem::Version
88
88
  version: '0'
89
89
  requirements: []
90
- rubygems_version: 3.0.3
90
+ rubygems_version: 3.1.6
91
91
  signing_key:
92
92
  specification_version: 4
93
93
  summary: Parser for ingredient lists found on food products.