food_ingredient_parser 1.1.10 → 1.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +2 -2
- data/lib/food_ingredient_parser/loose/node.rb +11 -4
- data/lib/food_ingredient_parser/loose/scanner.rb +38 -14
- data/lib/food_ingredient_parser/loose/transform/amount.rb +17 -9
- data/lib/food_ingredient_parser/loose/transform/handle_missing_name.rb +2 -1
- data/lib/food_ingredient_parser/loose/transform/split_e_numbers.rb +17 -11
- data/lib/food_ingredient_parser/strict/grammar/common.treetop +1 -0
- data/lib/food_ingredient_parser/strict/grammar/ingredient.treetop +6 -1
- data/lib/food_ingredient_parser/strict/grammar/list.treetop +7 -5
- data/lib/food_ingredient_parser/strict/grammar/list_coloned.treetop +1 -0
- data/lib/food_ingredient_parser/strict/grammar/root.treetop +3 -2
- data/lib/food_ingredient_parser/version.rb +2 -2
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: deb4cb55b3d5c41f02171e031fd11cc996cf2e8df9f074aa163efdff58baa6b0
|
4
|
+
data.tar.gz: 63dc1b52a15e6f70114cca9ed5d8a585a1f475d70131e4852310cf8755558dca
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1cebae488578f1e00f8d905f34d39cef653cdcb4922d26878687afb3463ae3c24ca6592a6f19ac482b5a2f08e95feef342c9631da69acb99feea4e1a81269057
|
7
|
+
data.tar.gz: a7c8b98a5c3fd3aee8962e8f31cd9a0ede791e8d7c7193bfbb1ba2524be057bd7b38499973cd6ab6fea58020161ba6f8615dc5eef2191a87d45afec4662fa264
|
data/README.md
CHANGED
@@ -104,7 +104,7 @@ RootNode+Root3 offset=0, "tomato" (contains,notes):
|
|
104
104
|
SyntaxNode offset=6, ""
|
105
105
|
{:contains=>[{:name=>"tomato"}]}
|
106
106
|
|
107
|
-
$
|
107
|
+
$ food_ingredient_parser --html -s "tomato"
|
108
108
|
<div class="root"><span class='depth0'><span class='name'>tomato</span></span></div>
|
109
109
|
|
110
110
|
$ food_ingredient_parser -v -r loose -s "tomato"
|
@@ -197,7 +197,7 @@ plus a bit of English and German. Support for other languages is already good, b
|
|
197
197
|
areas: improvements are welcome (starting with a corpus in [data/](data/)).
|
198
198
|
|
199
199
|
Many ingredient lists from the USA are structured a bit differently than those from Europe, they
|
200
|
-
parse less well (that
|
200
|
+
parse less well (that is probably a matter of fine-tuning).
|
201
201
|
|
202
202
|
## Test data
|
203
203
|
|
@@ -5,7 +5,7 @@ module FoodIngredientParser::Loose
|
|
5
5
|
class Node
|
6
6
|
include ToHtml
|
7
7
|
|
8
|
-
attr_accessor :
|
8
|
+
attr_accessor :name_parts, :mark, :amount, :contains, :notes
|
9
9
|
attr_reader :input, :interval, :auto_close
|
10
10
|
|
11
11
|
def initialize(input, interval, auto_close: false)
|
@@ -14,7 +14,8 @@ module FoodIngredientParser::Loose
|
|
14
14
|
@auto_close = auto_close
|
15
15
|
@contains = []
|
16
16
|
@notes = []
|
17
|
-
@
|
17
|
+
@name_parts = []
|
18
|
+
@mark = @amount = nil
|
18
19
|
end
|
19
20
|
|
20
21
|
def ends(index)
|
@@ -31,7 +32,8 @@ module FoodIngredientParser::Loose
|
|
31
32
|
|
32
33
|
def to_h
|
33
34
|
r = {}
|
34
|
-
|
35
|
+
_name = name
|
36
|
+
r[:name] = _name if _name
|
35
37
|
r[:marks] = [mark.text_value.strip] if mark
|
36
38
|
r[:amount] = amount.text_value.strip if amount
|
37
39
|
r[:contains] = contains.map(&:to_h).reject {|c| c == {} } if contains.any?
|
@@ -39,6 +41,11 @@ module FoodIngredientParser::Loose
|
|
39
41
|
r
|
40
42
|
end
|
41
43
|
|
44
|
+
def name
|
45
|
+
strings = name_parts.map {|n| n.text_value.strip }.reject {|n| n == nil || n == '' }
|
46
|
+
return strings.any? ? strings.join(" ") : nil
|
47
|
+
end
|
48
|
+
|
42
49
|
def inspect(indent="", variant="")
|
43
50
|
inspect_self(indent, variant) +
|
44
51
|
inspect_children(indent)
|
@@ -47,7 +54,7 @@ module FoodIngredientParser::Loose
|
|
47
54
|
def inspect_self(indent="", variant="")
|
48
55
|
[
|
49
56
|
indent + "Node#{variant} interval=#{@interval}",
|
50
|
-
name ? "name=#{name.
|
57
|
+
name ? "name=#{name.inspect}" : nil,
|
51
58
|
mark ? "mark=#{mark.text_value.strip.inspect}" : nil,
|
52
59
|
amount ? "amount=#{amount.text_value.strip.inspect}" : nil,
|
53
60
|
auto_close ? "auto_close" : nil
|
@@ -4,8 +4,9 @@ module FoodIngredientParser::Loose
|
|
4
4
|
class Scanner
|
5
5
|
|
6
6
|
SEP_CHARS = "|;,.".freeze
|
7
|
+
AND_SEP_RE = /\A\s*(and|en|und)\s+/i.freeze
|
7
8
|
MARK_CHARS = "¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº⁽⁾†‡⁺•°▪◊#^˄*~".freeze
|
8
|
-
PREFIX_RE = /\A\s*(ingredients
|
9
|
+
PREFIX_RE = /\A\s*(ingredients(\s*list)?|contains|ingred[iï][eë]nt(en)?(declaratie)?|bevat|dit zit er\s?in|samenstelling|zutaten)\b\s*[:;.]?\s*/i.freeze
|
9
10
|
NOTE_RE = /\A\b(dit product kan\b|deze verpakking kan\b|kan sporen\b.*?\bbevatten\b|voor allergenen\b|allergenen\b|allergie[- ]informatie(\s*:|\b)|E\s*=|gemaakt in\b|geproduceerd in\b|bevat mogelijk\b|kijk voor meer\b|allergie-info|in de fabriek\b|in dit bedrijf\b|voor [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bis [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bgebruikt\b)/i.freeze
|
10
11
|
# Keep in sync with +abbrev+ in the +Common+ grammar, plus relevant ones from the +Amount+ grammar.
|
11
12
|
ABBREV_RE = Regexp.union(
|
@@ -23,8 +24,8 @@ module FoodIngredientParser::Loose
|
|
23
24
|
www\.[-_\/:%.A-Za-z0-9]+
|
24
25
|
)/xi,
|
25
26
|
*%w[
|
26
|
-
a.o.p b.g.a b.o.b c.a c.i d.e d.m.v d.o.c d.o.p d.s e.a e.g e.u f.i.l f.o.s
|
27
|
-
i.d i.e i.g.m.e i.g.p i.m.v i.o i.v.m l.s.l n.a n.b n.o n.v.t o.a o.b.v p.d.o
|
27
|
+
a.o.p b.g.a b.o.b c.a c.i d.e d.m.v d.o.c d.o.p d.s e.a e.g e.u f.i.l f.o.s h.o.h
|
28
|
+
i.a i.d i.e i.g.m.e i.g.p i.m.v i.o i.v.m l.s.l n.a n.b n.o n.v.t o.a o.b.v p.d.o
|
28
29
|
p.g.i q.s s.l s.s t.o.v u.h.t v.g v.s w.a w.o w.v vit denat alc vol conc subsp
|
29
30
|
min max ca
|
30
31
|
].map {|s| /\A#{Regexp.escape(s)}\b\.?/i}
|
@@ -32,8 +33,9 @@ module FoodIngredientParser::Loose
|
|
32
33
|
|
33
34
|
def initialize(s, index: 0)
|
34
35
|
@s = s # input string
|
35
|
-
@i = index # current index in string
|
36
|
+
@i = index # current index in string, the iterator looks at this character
|
36
37
|
@cur = nil # current node we're populating
|
38
|
+
@curifree = nil # last index in string for current node that we haven't added to a child node yet
|
37
39
|
@ancestors = [Node.new(@s, @i)] # nesting hierarchy
|
38
40
|
@iterator = :beginning # scan_iteration_<iterator> to use for parsing
|
39
41
|
@dest = :contains # append current node to this attribute on parent
|
@@ -75,6 +77,12 @@ module FoodIngredientParser::Loose
|
|
75
77
|
elsif ")]".include?(c) # close nesting
|
76
78
|
add_child
|
77
79
|
close_parent
|
80
|
+
# after bracket check for 'and' to not lose text
|
81
|
+
if is_and_sep?(@i+1)
|
82
|
+
@i += and_sep_len(@i+1)
|
83
|
+
@curifree = @i # don't include 'and' in cur name
|
84
|
+
add_child
|
85
|
+
end
|
78
86
|
elsif is_notes_start? # usually a dot marks the start of notes
|
79
87
|
close_all_ancestors
|
80
88
|
@iterator = :notes
|
@@ -141,13 +149,26 @@ module FoodIngredientParser::Loose
|
|
141
149
|
end
|
142
150
|
|
143
151
|
def cur
|
144
|
-
|
152
|
+
if !@cur
|
153
|
+
@cur ||= Node.new(@s, @i)
|
154
|
+
@curifree = @i
|
155
|
+
end
|
156
|
+
@cur
|
145
157
|
end
|
146
158
|
|
147
159
|
def is_sep?(chars: SEP_CHARS)
|
148
160
|
chars.include?(c) && @s[@i-1..@i+1] !~ /\A\d.\d\z/
|
149
161
|
end
|
150
162
|
|
163
|
+
def is_and_sep?(i = @i)
|
164
|
+
and_sep_len(i) > 0
|
165
|
+
end
|
166
|
+
|
167
|
+
def and_sep_len(i = @i)
|
168
|
+
m = @s[i..-1].match(AND_SEP_RE)
|
169
|
+
m ? m.offset(0).last : 0
|
170
|
+
end
|
171
|
+
|
151
172
|
def is_mark?(i = @i)
|
152
173
|
mark_len(i) > 0 && @s[i..i+1] !~ /\A°[CF]/
|
153
174
|
end
|
@@ -186,16 +207,19 @@ module FoodIngredientParser::Loose
|
|
186
207
|
cur.ends(@i-1)
|
187
208
|
parent.send(@dest) << cur
|
188
209
|
@cur = nil
|
210
|
+
@curifree = nil
|
189
211
|
end
|
190
212
|
|
191
213
|
def open_parent(**options)
|
192
214
|
name_until_here
|
193
215
|
@ancestors << cur
|
194
216
|
@cur = Node.new(@s, @i + 1, **options)
|
217
|
+
@curifree = @i + 1
|
195
218
|
end
|
196
219
|
|
197
220
|
def close_parent
|
198
221
|
return unless @ancestors.count > 1
|
222
|
+
@curifree = @i + 1
|
199
223
|
@cur = @ancestors.pop
|
200
224
|
while @cur.auto_close
|
201
225
|
add_child
|
@@ -212,15 +236,15 @@ module FoodIngredientParser::Loose
|
|
212
236
|
end
|
213
237
|
|
214
238
|
def name_until_here
|
215
|
-
cur
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
239
|
+
return unless @curifree # no cur started yet
|
240
|
+
i, j = @curifree, @i - 1
|
241
|
+
i += mark_len(i) # skip any mark in front
|
242
|
+
# Set name if there is any. There is one corner-case that needs to be avoided when
|
243
|
+
# a nesting was opened without a name, which would set the name to the nesting text.
|
244
|
+
# In this case, the name starts with an open-nesting symbol, which should never happen.
|
245
|
+
if j >= i && !"([:".include?(@s[i])
|
246
|
+
cur.name_parts << Node.new(@s, i .. j)
|
247
|
+
@curifree = @i
|
224
248
|
end
|
225
249
|
end
|
226
250
|
|
@@ -29,18 +29,26 @@ module FoodIngredientParser::Loose
|
|
29
29
|
|
30
30
|
# Extract amount from name, if any.
|
31
31
|
def transform_name(node = @node)
|
32
|
-
if !node.amount
|
33
|
-
|
32
|
+
if !node.amount
|
33
|
+
node.name_parts.each_with_index do |name, i|
|
34
|
+
parsed = parse_amount(name.text_value)
|
35
|
+
next unless parsed
|
36
|
+
offset = name.interval.first
|
34
37
|
|
35
|
-
|
36
|
-
|
38
|
+
amount = parsed.amount.amount
|
39
|
+
node.amount = Node.new(node.input, offset + amount.interval.first .. offset + amount.interval.last - 1)
|
37
40
|
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
41
|
+
name = parsed.respond_to?(:name) && parsed.name
|
42
|
+
node.name_parts[i] = if name && name.interval.count > 0
|
43
|
+
Node.new(node.input, offset + name.interval.first .. offset + name.interval.last - 1)
|
44
|
+
else
|
45
|
+
nil
|
46
|
+
end
|
47
|
+
# found an amount, stop looking in other parts
|
48
|
+
break
|
43
49
|
end
|
50
|
+
# remove cleared name parts
|
51
|
+
node.name_parts.reject!(&:nil?)
|
44
52
|
end
|
45
53
|
|
46
54
|
# recursively transform contained nodes
|
@@ -42,7 +42,8 @@ module FoodIngredientParser::Loose
|
|
42
42
|
# Apply recursively. Do it before processing to handle multiple depth levels of missing names.
|
43
43
|
transform_children!(child) if child.contains.any?
|
44
44
|
|
45
|
-
|
45
|
+
name = child.name
|
46
|
+
if name.nil? || name == ''
|
46
47
|
# Name is empty, we need to do something.
|
47
48
|
if prev
|
48
49
|
# there is a previous ingredient: move children to new parent
|
@@ -29,21 +29,27 @@ module FoodIngredientParser::Loose
|
|
29
29
|
def transform_node!(node)
|
30
30
|
if node.contains.any?
|
31
31
|
node.contains.each {|n| transform_node!(n) }
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
32
|
+
else
|
33
|
+
node.name_parts.each_with_index do |name, name_index|
|
34
|
+
if m = MATCH_RE.match(name.text_value)
|
35
|
+
i = 0
|
36
|
+
while m = name.text_value.match(SPLIT_RE, i)
|
37
|
+
node.contains << new_node(name, i, m.begin(0)-1)
|
38
|
+
i = m.end(0)
|
39
|
+
end
|
40
|
+
node.contains << new_node(name, i, name.interval.last) if i <= name.interval.last
|
41
|
+
node.name_parts[name_index] = nil
|
42
|
+
end
|
37
43
|
end
|
38
|
-
|
39
|
-
node.
|
44
|
+
# remove cleared name parts
|
45
|
+
node.name_parts.reject!(&:nil?)
|
40
46
|
end
|
41
47
|
end
|
42
48
|
|
43
|
-
def new_node(
|
44
|
-
offset =
|
45
|
-
new_node = Node.new(
|
46
|
-
new_node.
|
49
|
+
def new_node(name, begins, ends)
|
50
|
+
offset = name.interval.first
|
51
|
+
new_node = Node.new(name.input, offset + begins .. offset + ends)
|
52
|
+
new_node.name_parts = [Node.new(name.input, new_node.interval)]
|
47
53
|
new_node
|
48
54
|
end
|
49
55
|
end
|
@@ -5,7 +5,12 @@ module FoodIngredientParser::Strict::Grammar
|
|
5
5
|
include IngredientColoned
|
6
6
|
|
7
7
|
rule ingredient
|
8
|
-
ws*
|
8
|
+
ws*
|
9
|
+
(
|
10
|
+
ingredient_nested ( ws* and ws+ ingredient )? /
|
11
|
+
ingredient_coloned /
|
12
|
+
ingredient_simple_with_amount
|
13
|
+
)
|
9
14
|
end
|
10
15
|
|
11
16
|
end
|
@@ -4,11 +4,13 @@ module FoodIngredientParser::Strict::Grammar
|
|
4
4
|
include Ingredient
|
5
5
|
|
6
6
|
rule list
|
7
|
-
contains:(ingredient ( ws* '|' ws* ingredient )+ ( ws+ and ws+ ingredient )? )
|
8
|
-
contains:(ingredient ( ws* ';' ws* ingredient )+ ( ws+ and ws+ ingredient )? )
|
9
|
-
contains:(ingredient ( ws* ',' ws* ingredient )+ ( ws+ and ws+ ingredient )? )
|
10
|
-
contains:(ingredient ( ws* '.' ws* ingredient )+ ( ws+ and ws+ ingredient )? )
|
11
|
-
contains:(
|
7
|
+
contains:(ingredient ( ws* '|' ws* ingredient )+ ( ws+ and ws+ ingredient )? ) <ListNode> /
|
8
|
+
contains:(ingredient ( ws* ';' ws* ingredient )+ ( ws+ and ws+ ingredient )? ) <ListNode> /
|
9
|
+
contains:(ingredient ( ws* ',' ws* ingredient )+ ( ws+ and ws+ ingredient )? ) <ListNode> /
|
10
|
+
contains:(ingredient ( ws* '.' ws* ingredient )+ ( ws+ and ws+ ingredient )? ) <ListNode> /
|
11
|
+
contains:(ingredient_simple_e_number ( ws* '/' ws* ingredient_simple_e_number )+ ) <ListNode> /
|
12
|
+
contains:(ingredient_simple_e_number ( ws* dash ws* ingredient_simple_e_number )+ ) <ListNode> /
|
13
|
+
contains:(ingredient ( ws+ and ws+ ingredient )? ) <ListNode>
|
12
14
|
end
|
13
15
|
end
|
14
16
|
end
|
@@ -17,6 +17,7 @@ module FoodIngredientParser::Strict::Grammar
|
|
17
17
|
end
|
18
18
|
|
19
19
|
rule list_coloned_inner_list
|
20
|
+
contains:( ingredient_simple_e_number ( ws* '/' ws* ingredient_simple_e_number )+ ) <ListNode> /
|
20
21
|
contains:( ingredient_simple_e_number ( ws* dash ws* ingredient_simple_e_number )+ ) <ListNode> /
|
21
22
|
contains:( ingredient ( ws* ',' ws* ingredient )* ) <ListNode>
|
22
23
|
end
|
@@ -19,9 +19,10 @@ module FoodIngredientParser::Strict::Grammar
|
|
19
19
|
|
20
20
|
rule root_prefix
|
21
21
|
(
|
22
|
-
'ingredients'i / 'contains'i /
|
22
|
+
'ingredients'i ( ws+ 'list'i )? / 'contains'i /
|
23
23
|
('ingred'i [IÏiï] [EËeë] 'n'i ( 't'i 'en'i? 'declaratie'i? )? ) / 'bevat'i / 'dit zit er in'i / 'samenstelling'i /
|
24
|
-
'zutaten'i
|
24
|
+
'zutaten'i /
|
25
|
+
'ingredienser'i
|
25
26
|
)
|
26
27
|
( ws* [:;.] ( ws* newline )? / ws* newline / ws ) ws* # optional colon or other separator
|
27
28
|
"'"? ws* # stray quote occurs sometimes
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: food_ingredient_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- wvengen
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-06-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: treetop
|
@@ -87,7 +87,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
87
87
|
- !ruby/object:Gem::Version
|
88
88
|
version: '0'
|
89
89
|
requirements: []
|
90
|
-
rubygems_version: 3.
|
90
|
+
rubygems_version: 3.1.6
|
91
91
|
signing_key:
|
92
92
|
specification_version: 4
|
93
93
|
summary: Parser for ingredient lists found on food products.
|