food_ingredient_parser 1.1.10 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a56d22b7e67a3a913b051bcbda8da885ddd467dc53f5a0df0faa5b40759a1f35
4
- data.tar.gz: 427dd79c9f9203dc7901ead6264e08c05183d02aec266ac1d3bff930a5ba1dcd
3
+ metadata.gz: 7c478a080e36c8f48ee3dbd6e9978eadec3758a4b0ab6fab571e18f103ed6bf0
4
+ data.tar.gz: aa078366f72ab03d038d497c908a3ad92f5816f37d3f0308fa64e81680905dea
5
5
  SHA512:
6
- metadata.gz: 0b07032ade3a55ce208bcb0c069223b41aee21f185a2b6a9bb91332881dfef8e1d829ae966097e48ffdba9984517be43b10bd027099f9bdce04e3a4c6fc41ca8
7
- data.tar.gz: ebdf452a09d54b151ce8cfa9bb65b4477dd1afc81bfc5cd1d94055f726d387f522dd04a3e11b73d4b26222a18bc1068912a81c8e2e3cd8439b0cee1c1ec290d7
6
+ metadata.gz: d8acbd71e431958a72350e6fd1d3e5e8d21db8ee53525c53a08bbe2c564734fca9601ac0fdc33d9737695f292bc7cd6da898721f02f68ca8f87175c5b276c709
7
+ data.tar.gz: f261a1537a6e903d55b36dc91c0a1c302893d7a092a83afb8598730c87142d041fdaf86540918697f297f5024461082e5e56594df64f603f56f4b22148c7c9fd
data/README.md CHANGED
@@ -104,7 +104,7 @@ RootNode+Root3 offset=0, "tomato" (contains,notes):
104
104
  SyntaxNode offset=6, ""
105
105
  {:contains=>[{:name=>"tomato"}]}
106
106
 
107
- $ bin/food_ingredient_parser --html -s "tomato"
107
+ $ food_ingredient_parser --html -s "tomato"
108
108
  <div class="root"><span class='depth0'><span class='name'>tomato</span></span></div>
109
109
 
110
110
  $ food_ingredient_parser -v -r loose -s "tomato"
@@ -197,7 +197,7 @@ plus a bit of English and German. Support for other languages is already good, b
197
197
  areas: improvements are welcome (starting with a corpus in [data/](data/)).
198
198
 
199
199
  Many ingredient lists from the USA are structured a bit differently than those from Europe, they
200
- parse less well (that that's a matter of tine-tuning).
200
+ parse less well (that is probably a matter of fine-tuning).
201
201
 
202
202
  ## Test data
203
203
 
@@ -4,8 +4,9 @@ module FoodIngredientParser::Loose
4
4
  class Scanner
5
5
 
6
6
  SEP_CHARS = "|;,.".freeze
7
+ AND_SEP_RE = /\A\s*(and|en|und)\s+/i.freeze
7
8
  MARK_CHARS = "¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº⁽⁾†‡⁺•°▪◊#^˄*~".freeze
8
- PREFIX_RE = /\A\s*(ingredients|contains|ingred[iï][eë]nt(en)?(declaratie)?|bevat|dit zit er\s?in|samenstelling|zutaten)\b\s*[:;.]?\s*/i.freeze
9
+ PREFIX_RE = /\A\s*(ingredients(\s*list)?|contains|ingred[iï][eë]nt(en)?(declaratie)?|bevat|dit zit er\s?in|samenstelling|zutaten)\b\s*[:;.]?\s*/i.freeze
9
10
  NOTE_RE = /\A\b(dit product kan\b|deze verpakking kan\b|kan sporen\b.*?\bbevatten\b|voor allergenen\b|allergenen\b|allergie[- ]informatie(\s*:|\b)|E\s*=|gemaakt in\b|geproduceerd in\b|bevat mogelijk\b|kijk voor meer\b|allergie-info|in de fabriek\b|in dit bedrijf\b|voor [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bis [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bgebruikt\b)/i.freeze
10
11
  # Keep in sync with +abbrev+ in the +Common+ grammar, plus relevant ones from the +Amount+ grammar.
11
12
  ABBREV_RE = Regexp.union(
@@ -23,8 +24,8 @@ module FoodIngredientParser::Loose
23
24
  www\.[-_\/:%.A-Za-z0-9]+
24
25
  )/xi,
25
26
  *%w[
26
- a.o.p b.g.a b.o.b c.a c.i d.e d.m.v d.o.c d.o.p d.s e.a e.g e.u f.i.l f.o.s i.a
27
- i.d i.e i.g.m.e i.g.p i.m.v i.o i.v.m l.s.l n.a n.b n.o n.v.t o.a o.b.v p.d.o
27
+ a.o.p b.g.a b.o.b c.a c.i d.e d.m.v d.o.c d.o.p d.s e.a e.g e.u f.i.l f.o.s h.o.h
28
+ i.a i.d i.e i.g.m.e i.g.p i.m.v i.o i.v.m l.s.l n.a n.b n.o n.v.t o.a o.b.v p.d.o
28
29
  p.g.i q.s s.l s.s t.o.v u.h.t v.g v.s w.a w.o w.v vit denat alc vol conc subsp
29
30
  min max ca
30
31
  ].map {|s| /\A#{Regexp.escape(s)}\b\.?/i}
@@ -75,6 +76,11 @@ module FoodIngredientParser::Loose
75
76
  elsif ")]".include?(c) # close nesting
76
77
  add_child
77
78
  close_parent
79
+ # after bracket check for 'and' to not lose text
80
+ if is_and_sep?(@i+1)
81
+ @i += and_sep_len(@i+1)
82
+ add_child
83
+ end
78
84
  elsif is_notes_start? # usually a dot marks the start of notes
79
85
  close_all_ancestors
80
86
  @iterator = :notes
@@ -148,6 +154,15 @@ module FoodIngredientParser::Loose
148
154
  chars.include?(c) && @s[@i-1..@i+1] !~ /\A\d.\d\z/
149
155
  end
150
156
 
157
+ def is_and_sep?(i = @i)
158
+ and_sep_len(i) > 0
159
+ end
160
+
161
+ def and_sep_len(i = @i)
162
+ m = @s[i..-1].match(AND_SEP_RE)
163
+ m ? m.offset(0).last : 0
164
+ end
165
+
151
166
  def is_mark?(i = @i)
152
167
  mark_len(i) > 0 && @s[i..i+1] !~ /\A°[CF]/
153
168
  end
@@ -102,6 +102,7 @@ module FoodIngredientParser::Strict::Grammar
102
102
  'e.u'i /
103
103
  'f.i.l'i /
104
104
  'f.o.s'i /
105
+ 'h.o.h'i /
105
106
  'i.a'i /
106
107
  'i.d'i /
107
108
  'i.e'i /
@@ -5,7 +5,12 @@ module FoodIngredientParser::Strict::Grammar
5
5
  include IngredientColoned
6
6
 
7
7
  rule ingredient
8
- ws* ( ingredient_nested / ingredient_coloned / ingredient_simple_with_amount )
8
+ ws*
9
+ (
10
+ ingredient_nested ( ws* and ws+ ingredient )? /
11
+ ingredient_coloned /
12
+ ingredient_simple_with_amount
13
+ )
9
14
  end
10
15
 
11
16
  end
@@ -19,9 +19,10 @@ module FoodIngredientParser::Strict::Grammar
19
19
 
20
20
  rule root_prefix
21
21
  (
22
- 'ingredients'i / 'contains'i /
22
+ 'ingredients'i ( ws+ 'list'i )? / 'contains'i /
23
23
  ('ingred'i [IÏiï] [EËeë] 'n'i ( 't'i 'en'i? 'declaratie'i? )? ) / 'bevat'i / 'dit zit er in'i / 'samenstelling'i /
24
- 'zutaten'i
24
+ 'zutaten'i /
25
+ 'ingredienser'i
25
26
  )
26
27
  ( ws* [:;.] ( ws* newline )? / ws* newline / ws ) ws* # optional colon or other separator
27
28
  "'"? ws* # stray quote occurs sometimes
@@ -1,4 +1,4 @@
1
1
  module FoodIngredientParser
2
- VERSION = '1.1.10'
3
- VERSION_DATE = '2021-03-23'
2
+ VERSION = '1.2.0'
3
+ VERSION_DATE = '2024-01-19'
4
4
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: food_ingredient_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.10
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - wvengen
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-03-23 00:00:00.000000000 Z
11
+ date: 2024-01-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: treetop
@@ -87,7 +87,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
87
87
  - !ruby/object:Gem::Version
88
88
  version: '0'
89
89
  requirements: []
90
- rubygems_version: 3.0.3
90
+ rubygems_version: 3.1.6
91
91
  signing_key:
92
92
  specification_version: 4
93
93
  summary: Parser for ingredient lists found on food products.