food_ingredient_parser 1.1.10 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +2 -2
- data/lib/food_ingredient_parser/loose/scanner.rb +18 -3
- data/lib/food_ingredient_parser/strict/grammar/common.treetop +1 -0
- data/lib/food_ingredient_parser/strict/grammar/ingredient.treetop +6 -1
- data/lib/food_ingredient_parser/strict/grammar/root.treetop +3 -2
- data/lib/food_ingredient_parser/version.rb +2 -2
- metadata +3 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 7c478a080e36c8f48ee3dbd6e9978eadec3758a4b0ab6fab571e18f103ed6bf0
|
|
4
|
+
data.tar.gz: aa078366f72ab03d038d497c908a3ad92f5816f37d3f0308fa64e81680905dea
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: d8acbd71e431958a72350e6fd1d3e5e8d21db8ee53525c53a08bbe2c564734fca9601ac0fdc33d9737695f292bc7cd6da898721f02f68ca8f87175c5b276c709
|
|
7
|
+
data.tar.gz: f261a1537a6e903d55b36dc91c0a1c302893d7a092a83afb8598730c87142d041fdaf86540918697f297f5024461082e5e56594df64f603f56f4b22148c7c9fd
|
data/README.md
CHANGED
|
@@ -104,7 +104,7 @@ RootNode+Root3 offset=0, "tomato" (contains,notes):
|
|
|
104
104
|
SyntaxNode offset=6, ""
|
|
105
105
|
{:contains=>[{:name=>"tomato"}]}
|
|
106
106
|
|
|
107
|
-
$
|
|
107
|
+
$ food_ingredient_parser --html -s "tomato"
|
|
108
108
|
<div class="root"><span class='depth0'><span class='name'>tomato</span></span></div>
|
|
109
109
|
|
|
110
110
|
$ food_ingredient_parser -v -r loose -s "tomato"
|
|
@@ -197,7 +197,7 @@ plus a bit of English and German. Support for other languages is already good, b
|
|
|
197
197
|
areas: improvements are welcome (starting with a corpus in [data/](data/)).
|
|
198
198
|
|
|
199
199
|
Many ingredient lists from the USA are structured a bit differently than those from Europe, they
|
|
200
|
-
parse less well (that
|
|
200
|
+
parse less well (that is probably a matter of fine-tuning).
|
|
201
201
|
|
|
202
202
|
## Test data
|
|
203
203
|
|
|
@@ -4,8 +4,9 @@ module FoodIngredientParser::Loose
|
|
|
4
4
|
class Scanner
|
|
5
5
|
|
|
6
6
|
SEP_CHARS = "|;,.".freeze
|
|
7
|
+
AND_SEP_RE = /\A\s*(and|en|und)\s+/i.freeze
|
|
7
8
|
MARK_CHARS = "¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº⁽⁾†‡⁺•°▪◊#^˄*~".freeze
|
|
8
|
-
PREFIX_RE = /\A\s*(ingredients
|
|
9
|
+
PREFIX_RE = /\A\s*(ingredients(\s*list)?|contains|ingred[iï][eë]nt(en)?(declaratie)?|bevat|dit zit er\s?in|samenstelling|zutaten)\b\s*[:;.]?\s*/i.freeze
|
|
9
10
|
NOTE_RE = /\A\b(dit product kan\b|deze verpakking kan\b|kan sporen\b.*?\bbevatten\b|voor allergenen\b|allergenen\b|allergie[- ]informatie(\s*:|\b)|E\s*=|gemaakt in\b|geproduceerd in\b|bevat mogelijk\b|kijk voor meer\b|allergie-info|in de fabriek\b|in dit bedrijf\b|voor [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bis [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bgebruikt\b)/i.freeze
|
|
10
11
|
# Keep in sync with +abbrev+ in the +Common+ grammar, plus relevant ones from the +Amount+ grammar.
|
|
11
12
|
ABBREV_RE = Regexp.union(
|
|
@@ -23,8 +24,8 @@ module FoodIngredientParser::Loose
|
|
|
23
24
|
www\.[-_\/:%.A-Za-z0-9]+
|
|
24
25
|
)/xi,
|
|
25
26
|
*%w[
|
|
26
|
-
a.o.p b.g.a b.o.b c.a c.i d.e d.m.v d.o.c d.o.p d.s e.a e.g e.u f.i.l f.o.s
|
|
27
|
-
i.d i.e i.g.m.e i.g.p i.m.v i.o i.v.m l.s.l n.a n.b n.o n.v.t o.a o.b.v p.d.o
|
|
27
|
+
a.o.p b.g.a b.o.b c.a c.i d.e d.m.v d.o.c d.o.p d.s e.a e.g e.u f.i.l f.o.s h.o.h
|
|
28
|
+
i.a i.d i.e i.g.m.e i.g.p i.m.v i.o i.v.m l.s.l n.a n.b n.o n.v.t o.a o.b.v p.d.o
|
|
28
29
|
p.g.i q.s s.l s.s t.o.v u.h.t v.g v.s w.a w.o w.v vit denat alc vol conc subsp
|
|
29
30
|
min max ca
|
|
30
31
|
].map {|s| /\A#{Regexp.escape(s)}\b\.?/i}
|
|
@@ -75,6 +76,11 @@ module FoodIngredientParser::Loose
|
|
|
75
76
|
elsif ")]".include?(c) # close nesting
|
|
76
77
|
add_child
|
|
77
78
|
close_parent
|
|
79
|
+
# after bracket check for 'and' to not lose text
|
|
80
|
+
if is_and_sep?(@i+1)
|
|
81
|
+
@i += and_sep_len(@i+1)
|
|
82
|
+
add_child
|
|
83
|
+
end
|
|
78
84
|
elsif is_notes_start? # usually a dot marks the start of notes
|
|
79
85
|
close_all_ancestors
|
|
80
86
|
@iterator = :notes
|
|
@@ -148,6 +154,15 @@ module FoodIngredientParser::Loose
|
|
|
148
154
|
chars.include?(c) && @s[@i-1..@i+1] !~ /\A\d.\d\z/
|
|
149
155
|
end
|
|
150
156
|
|
|
157
|
+
def is_and_sep?(i = @i)
|
|
158
|
+
and_sep_len(i) > 0
|
|
159
|
+
end
|
|
160
|
+
|
|
161
|
+
def and_sep_len(i = @i)
|
|
162
|
+
m = @s[i..-1].match(AND_SEP_RE)
|
|
163
|
+
m ? m.offset(0).last : 0
|
|
164
|
+
end
|
|
165
|
+
|
|
151
166
|
def is_mark?(i = @i)
|
|
152
167
|
mark_len(i) > 0 && @s[i..i+1] !~ /\A°[CF]/
|
|
153
168
|
end
|
|
@@ -5,7 +5,12 @@ module FoodIngredientParser::Strict::Grammar
|
|
|
5
5
|
include IngredientColoned
|
|
6
6
|
|
|
7
7
|
rule ingredient
|
|
8
|
-
ws*
|
|
8
|
+
ws*
|
|
9
|
+
(
|
|
10
|
+
ingredient_nested ( ws* and ws+ ingredient )? /
|
|
11
|
+
ingredient_coloned /
|
|
12
|
+
ingredient_simple_with_amount
|
|
13
|
+
)
|
|
9
14
|
end
|
|
10
15
|
|
|
11
16
|
end
|
|
@@ -19,9 +19,10 @@ module FoodIngredientParser::Strict::Grammar
|
|
|
19
19
|
|
|
20
20
|
rule root_prefix
|
|
21
21
|
(
|
|
22
|
-
'ingredients'i / 'contains'i /
|
|
22
|
+
'ingredients'i ( ws+ 'list'i )? / 'contains'i /
|
|
23
23
|
('ingred'i [IÏiï] [EËeë] 'n'i ( 't'i 'en'i? 'declaratie'i? )? ) / 'bevat'i / 'dit zit er in'i / 'samenstelling'i /
|
|
24
|
-
'zutaten'i
|
|
24
|
+
'zutaten'i /
|
|
25
|
+
'ingredienser'i
|
|
25
26
|
)
|
|
26
27
|
( ws* [:;.] ( ws* newline )? / ws* newline / ws ) ws* # optional colon or other separator
|
|
27
28
|
"'"? ws* # stray quote occurs sometimes
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: food_ingredient_parser
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.1.10
|
|
4
|
+
version: 1.2.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- wvengen
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2024-01-19 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: treetop
|
|
@@ -87,7 +87,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
87
87
|
- !ruby/object:Gem::Version
|
|
88
88
|
version: '0'
|
|
89
89
|
requirements: []
|
|
90
|
-
rubygems_version: 3.
|
|
90
|
+
rubygems_version: 3.1.6
|
|
91
91
|
signing_key:
|
|
92
92
|
specification_version: 4
|
|
93
93
|
summary: Parser for ingredient lists found on food products.
|