food_ingredient_parser 1.1.2 → 1.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +15 -1
- data/lib/food_ingredient_parser/loose/scanner.rb +17 -6
- data/lib/food_ingredient_parser/loose/transform/split_e_numbers.rb +3 -2
- data/lib/food_ingredient_parser/strict/grammar/amount.treetop +2 -0
- data/lib/food_ingredient_parser/strict/grammar/common.treetop +12 -4
- data/lib/food_ingredient_parser/strict/grammar/ingredient_nested.treetop +1 -1
- data/lib/food_ingredient_parser/strict/grammar/list_coloned.treetop +4 -0
- data/lib/food_ingredient_parser/version.rb +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 77503a77f269805d23ecb60a7b9d63401063e140
|
|
4
|
+
data.tar.gz: 412d0c6aab0924371677cb8e2a4201349f4684a8
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 82ba24e9277917326348e769feda731b030a9f6477f811f067b1ff1377e784d206adc56407a607e5b3498ddbbfcc8751c7ef6b3583490f59fa56771bdd9f0a80
|
|
7
|
+
data.tar.gz: '0768160047a661cad810a4fda0c7922e03b039bd17f4aa7b2022722cb636080fa01690a731da8d088861c6981163803b41d2bd577093f4b0a91250d4fe68c03d'
|
data/README.md
CHANGED
|
@@ -185,9 +185,23 @@ So if you only use the stable interface (`parse`, `to_h` and `to_html`), you can
|
|
|
185
185
|
to e.g. `~> 1.0`. If you depend on more, lock your version against e.g. `~> 1.0.0` and test when you
|
|
186
186
|
upgrade to `1.1`.
|
|
187
187
|
|
|
188
|
+
## Languages
|
|
189
|
+
|
|
190
|
+
While most of the parsing is language-independent, some parts need knowledge about certain words
|
|
191
|
+
(like abbreviations and amount specifiers). The gem was developed with ingredient lists in Dutch (nl),
|
|
192
|
+
plus a bit of English and German. Support for other languages is already good, but is lacking in certain
|
|
193
|
+
areas: improvements are welcome (starting with a corpus in [data/](data/)).
|
|
194
|
+
|
|
195
|
+
Many ingredient lists from the USA are structured a bit differently than those from Europe; they
|
|
196
|
+
parse less well (but that's a matter of fine-tuning).
|
|
197
|
+
|
|
188
198
|
## Test data
|
|
189
199
|
|
|
190
|
-
[`data/ingredient-samples-nl`](data/ingredient-samples-nl) contains about 150k
|
|
200
|
+
[`data/ingredient-samples-qm-nl`](data/ingredient-samples-qm-nl) contains about 150k
|
|
191
201
|
real-world ingredient lists found on the Dutch market. Each line contains one ingredient
|
|
192
202
|
list (newlines are encoded as `\n`, empty lines and those starting with `#` are ignored).
|
|
193
203
|
The strict parser currently parses 80%, while the loose parser returns something for all of them.
|
|
204
|
+
|
|
205
|
+
## License
|
|
206
|
+
|
|
207
|
+
This software is distributed under the [MIT license](LICENSE). Data may have a [different license](data/README.md).
|
|
@@ -8,12 +8,23 @@ module FoodIngredientParser::Loose
|
|
|
8
8
|
PREFIX_RE = /\A\s*(ingredients|contains|ingred[iï][eë]nt(en)?(declaratie)?|bevat|dit zit er\s?in|samenstelling|zutaten)\b\s*[:;.]?\s*/i.freeze
|
|
9
9
|
NOTE_RE = /\A\b(dit product kan\b|deze verpakking kan\b|kan sporen\b.*?\bbevatten\b|voor allergenen\b|allergenen\b|E\s*=|gemaakt in\b|geproduceerd in\b|bevat mogelijk\b|kijk voor meer\b|allergie-info|in de fabriek\b|in dit bedrijf\b|voor [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bis [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bgebruikt\b)/i.freeze
|
|
10
10
|
# Keep in sync with +abbrev+ in the +Common+ grammar, plus relevant ones from the +Amount+ grammar.
|
|
11
|
-
ABBREV_RE = Regexp.union(
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
11
|
+
ABBREV_RE = Regexp.union(
|
|
12
|
+
/\A(
|
|
13
|
+
N°\b |
|
|
14
|
+
°C\b |
|
|
15
|
+
(ijzer|chroom|koper)\s*\(I+\)\s*[[:alnum:]]+\b |
|
|
16
|
+
L\(\+\)-[[:alnum:]]+\b |
|
|
17
|
+
type\s+"\d+" |
|
|
18
|
+
L\.\s+rhamnosus\b | L\.\s+acidophilus\b | L\.\s+casei\b | B\.\s+lactis | A\.\s+oryzae
|
|
19
|
+
E-e?\d{3}[a-z]?\s*\(i+\)
|
|
20
|
+
)/xi,
|
|
21
|
+
*%w[
|
|
22
|
+
a.o.p b.g.a b.o.b c.a c.i d.e d.m.v d.o.c d.o.p d.s e.a e.g e.u f.i.l f.o.s i.a
|
|
23
|
+
i.d i.e i.g.m.e i.g.p i.m.v i.o i.v.m l.s.l n.a n.b n.o n.v.t o.a o.b.v p.d.o
|
|
24
|
+
p.g.i q.s s.l s.s t.o.v u.h.t v.g v.s w.a w.o w.v vit denat alc vol conc subsp
|
|
25
|
+
min max ca
|
|
26
|
+
].map {|s| /\A#{Regexp.escape(s)}\b\.?/i}
|
|
27
|
+
).freeze
|
|
17
28
|
|
|
18
29
|
def initialize(s, index: 0)
|
|
19
30
|
@s = s # input string
|
|
@@ -7,8 +7,9 @@ module FoodIngredientParser::Loose
|
|
|
7
7
|
#
|
|
8
8
|
# @note mark and amount is lost, this is not expected on e-numbers
|
|
9
9
|
|
|
10
|
-
SPLIT_RE
|
|
11
|
-
|
|
10
|
+
SPLIT_RE = /\s*-\s*/.freeze
|
|
11
|
+
SINGLE_RE = /E-?\d{3}[a-z]?(?:\s*\(i+\))?/i.freeze
|
|
12
|
+
MATCH_RE = /\A\s*(#{SINGLE_RE})(?:#{SPLIT_RE}(#{SINGLE_RE}))+\s*\z/i.freeze
|
|
12
13
|
|
|
13
14
|
def self.transform!(node)
|
|
14
15
|
new(node).transform!
|
|
@@ -17,6 +17,7 @@ module FoodIngredientParser::Strict::Grammar
|
|
|
17
17
|
) ws* )?
|
|
18
18
|
amount_simple_quantity
|
|
19
19
|
( ws+ (
|
|
20
|
+
'of'i / 'or less of'i / 'or more of'i /
|
|
20
21
|
'minimaal'i / 'minimum'i / 'van het uitlekgewicht'i / 'van het geheel'i /
|
|
21
22
|
'min.'i / 'min'i / 'max.'i / 'max'i
|
|
22
23
|
) )?
|
|
@@ -32,6 +33,7 @@ module FoodIngredientParser::Strict::Grammar
|
|
|
32
33
|
|
|
33
34
|
rule amount_simple_unit
|
|
34
35
|
( percent / ( ( 'procent' / 'percent' / 'gram'i / 'ml'i / 'mg'i / 'gr'i / 'g'i ) !char ) )
|
|
36
|
+
( ws 'vol'i ( !char / '.' ) )?
|
|
35
37
|
end
|
|
36
38
|
end
|
|
37
39
|
end
|
|
@@ -48,7 +48,7 @@ module FoodIngredientParser::Strict::Grammar
|
|
|
48
48
|
end
|
|
49
49
|
|
|
50
50
|
rule and
|
|
51
|
-
( 'and' / 'en' / 'und' / '&'
|
|
51
|
+
( 'and' / 'en' / 'und' ) !char / '&'
|
|
52
52
|
end
|
|
53
53
|
|
|
54
54
|
rule abbrev
|
|
@@ -105,8 +105,12 @@ module FoodIngredientParser::Strict::Grammar
|
|
|
105
105
|
'w.o'i /
|
|
106
106
|
'w.v'i /
|
|
107
107
|
# not auto-generated additions
|
|
108
|
-
'vit'i /
|
|
109
|
-
'denat'i
|
|
108
|
+
'vit'i / # vitamin
|
|
109
|
+
'denat'i / # denaturated
|
|
110
|
+
'alc'i / # alcohol
|
|
111
|
+
'vol'i / # volume
|
|
112
|
+
'conc'i / # concentration
|
|
113
|
+
'subsp'i # subspecies
|
|
110
114
|
)
|
|
111
115
|
'.'? ![[:alpha:]]
|
|
112
116
|
end
|
|
@@ -116,7 +120,11 @@ module FoodIngredientParser::Strict::Grammar
|
|
|
116
120
|
(
|
|
117
121
|
'N°'i /
|
|
118
122
|
'°C'i /
|
|
119
|
-
( 'ijzer'i / 'chroom'i / 'koper'i ) ws* '(' 'I'i+ ')' ws* [[:alnum:]]+
|
|
123
|
+
( 'ijzer'i / 'chroom'i / 'koper'i ) ws* '(' 'I'i+ ')' ws* [[:alnum:]]+ /
|
|
124
|
+
'L(+)-' [[:alnum:]]+ /
|
|
125
|
+
'L.' ws+ 'rhamnosus'i / 'L.' ws+ 'acidophilus'i / 'L.' ws+ 'casei' / 'B.'i ws+ 'lactis'i / 'A.'i ws+ 'oryzae'i /
|
|
126
|
+
'type'i ws+ '"' [0-9]+ '"' /
|
|
127
|
+
'E'i '-'? [0-9] [0-9] [0-9] [[:alpha:]]? ws* '(' 'i'i+ ')' # e.g. "E450 (iii)"
|
|
120
128
|
) ![[:alpha:]]
|
|
121
129
|
end
|
|
122
130
|
end
|
|
@@ -5,8 +5,12 @@ module FoodIngredientParser::Strict::Grammar
|
|
|
5
5
|
include Ingredient
|
|
6
6
|
|
|
7
7
|
rule list_coloned
|
|
8
|
+
contains:( ( ws* list_coloned_ingredient ws* '.,')+ ws* list_coloned_ingredient ) <ListNode> /
|
|
9
|
+
contains:( ( ws* list_coloned_ingredient ws* '.,')+ ) <ListNode> /
|
|
8
10
|
contains:( ( ws* list_coloned_ingredient ws* '.' )+ ws* list_coloned_ingredient ) <ListNode> /
|
|
9
11
|
contains:( ( ws* list_coloned_ingredient ws* '.' )+ ) <ListNode> /
|
|
12
|
+
contains:( ( ws* list_coloned_ingredient ws* ';,')+ ws* list_coloned_ingredient ) <ListNode> /
|
|
13
|
+
contains:( ( ws* list_coloned_ingredient ws* ';,')+ ) <ListNode> /
|
|
10
14
|
contains:( ( ws* list_coloned_ingredient ws* ';' )+ ws* list_coloned_ingredient ) <ListNode> /
|
|
11
15
|
contains:( ( ws* list_coloned_ingredient ws* ';' )+ ) <ListNode> /
|
|
12
16
|
contains:( ws* list_coloned_ingredient ) <ListNode>
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: food_ingredient_parser
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.1.
|
|
4
|
+
version: 1.1.3
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- wvengen
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2018-
|
|
11
|
+
date: 2018-10-12 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: treetop
|