food_ingredient_parser 1.1.2 → 1.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +15 -1
- data/lib/food_ingredient_parser/loose/scanner.rb +17 -6
- data/lib/food_ingredient_parser/loose/transform/split_e_numbers.rb +3 -2
- data/lib/food_ingredient_parser/strict/grammar/amount.treetop +2 -0
- data/lib/food_ingredient_parser/strict/grammar/common.treetop +12 -4
- data/lib/food_ingredient_parser/strict/grammar/ingredient_nested.treetop +1 -1
- data/lib/food_ingredient_parser/strict/grammar/list_coloned.treetop +4 -0
- data/lib/food_ingredient_parser/version.rb +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 77503a77f269805d23ecb60a7b9d63401063e140
|
4
|
+
data.tar.gz: 412d0c6aab0924371677cb8e2a4201349f4684a8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 82ba24e9277917326348e769feda731b030a9f6477f811f067b1ff1377e784d206adc56407a607e5b3498ddbbfcc8751c7ef6b3583490f59fa56771bdd9f0a80
|
7
|
+
data.tar.gz: '0768160047a661cad810a4fda0c7922e03b039bd17f4aa7b2022722cb636080fa01690a731da8d088861c6981163803b41d2bd577093f4b0a91250d4fe68c03d'
|
data/README.md
CHANGED
@@ -185,9 +185,23 @@ So if you only use the stable interface (`parse`, `to_h` and `to_html`), you can
|
|
185
185
|
to e.g. `~> 1.0`. If you depend on more, lock your version against e.g. `~> 1.0.0` and test when you
|
186
186
|
upgrade to `1.1`.
|
187
187
|
|
188
|
+
## Languages
|
189
|
+
|
190
|
+
While most of the parsing is language-independent, some parts need knowledge about certain words
|
191
|
+
(like abbreviations and amount specifiers). The gem was developed with ingredient lists in Dutch (nl),
|
192
|
+
plus a bit of English and German. Support for other languages is already good, but is lacking in certain
|
193
|
+
areas: improvements are welcome (starting with a corpus in [data/](data/)).
|
194
|
+
|
195
|
+
Many ingredient lists from the USA are structured a bit differently than those from Europe; they
|
196
|
+
parse less well (but that's a matter of fine-tuning).
|
197
|
+
|
188
198
|
## Test data
|
189
199
|
|
190
|
-
[`data/ingredient-samples-nl`](data/ingredient-samples-nl) contains about 150k
|
200
|
+
[`data/ingredient-samples-qm-nl`](data/ingredient-samples-qm-nl) contains about 150k
|
191
201
|
real-world ingredient lists found on the Dutch market. Each line contains one ingredient
|
192
202
|
list (newlines are encoded as `\n`, empty lines and those starting with `#` are ignored).
|
193
203
|
The strict parser currently parses 80%, while the loose parser returns something for all of them.
|
204
|
+
|
205
|
+
## License
|
206
|
+
|
207
|
+
This software is distributed under the [MIT license](LICENSE). Data may have a [different license](data/README.md).
|
@@ -8,12 +8,23 @@ module FoodIngredientParser::Loose
|
|
8
8
|
PREFIX_RE = /\A\s*(ingredients|contains|ingred[iï][eë]nt(en)?(declaratie)?|bevat|dit zit er\s?in|samenstelling|zutaten)\b\s*[:;.]?\s*/i.freeze
|
9
9
|
NOTE_RE = /\A\b(dit product kan\b|deze verpakking kan\b|kan sporen\b.*?\bbevatten\b|voor allergenen\b|allergenen\b|E\s*=|gemaakt in\b|geproduceerd in\b|bevat mogelijk\b|kijk voor meer\b|allergie-info|in de fabriek\b|in dit bedrijf\b|voor [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bis [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bgebruikt\b)/i.freeze
|
10
10
|
# Keep in sync with +abbrev+ in the +Common+ grammar, plus relevant ones from the +Amount+ grammar.
|
11
|
-
ABBREV_RE = Regexp.union(
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
11
|
+
ABBREV_RE = Regexp.union(
|
12
|
+
/\A(
|
13
|
+
N°\b |
|
14
|
+
°C\b |
|
15
|
+
(ijzer|chroom|koper)\s*\(I+\)\s*[[:alnum:]]+\b |
|
16
|
+
L\(\+\)-[[:alnum:]]+\b |
|
17
|
+
type\s+"\d+" |
|
18
|
+
L\.\s+rhamnosus\b | L\.\s+acidophilus\b | L\.\s+casei\b | B\.\s+lactis | A\.\s+oryzae
|
19
|
+
E-e?\d{3}[a-z]?\s*\(i+\)
|
20
|
+
)/xi,
|
21
|
+
*%w[
|
22
|
+
a.o.p b.g.a b.o.b c.a c.i d.e d.m.v d.o.c d.o.p d.s e.a e.g e.u f.i.l f.o.s i.a
|
23
|
+
i.d i.e i.g.m.e i.g.p i.m.v i.o i.v.m l.s.l n.a n.b n.o n.v.t o.a o.b.v p.d.o
|
24
|
+
p.g.i q.s s.l s.s t.o.v u.h.t v.g v.s w.a w.o w.v vit denat alc vol conc subsp
|
25
|
+
min max ca
|
26
|
+
].map {|s| /\A#{Regexp.escape(s)}\b\.?/i}
|
27
|
+
).freeze
|
17
28
|
|
18
29
|
def initialize(s, index: 0)
|
19
30
|
@s = s # input string
|
@@ -7,8 +7,9 @@ module FoodIngredientParser::Loose
|
|
7
7
|
#
|
8
8
|
# @note mark and amount is lost, this is not expected on e-numbers
|
9
9
|
|
10
|
-
SPLIT_RE
|
11
|
-
|
10
|
+
SPLIT_RE = /\s*-\s*/.freeze
|
11
|
+
SINGLE_RE = /E-?\d{3}[a-z]?(?:\s*\(i+\))?/i.freeze
|
12
|
+
MATCH_RE = /\A\s*(#{SINGLE_RE})(?:#{SPLIT_RE}(#{SINGLE_RE}))+\s*\z/i.freeze
|
12
13
|
|
13
14
|
def self.transform!(node)
|
14
15
|
new(node).transform!
|
@@ -17,6 +17,7 @@ module FoodIngredientParser::Strict::Grammar
|
|
17
17
|
) ws* )?
|
18
18
|
amount_simple_quantity
|
19
19
|
( ws+ (
|
20
|
+
'of'i / 'or less of'i / 'or more of'i /
|
20
21
|
'minimaal'i / 'minimum'i / 'van het uitlekgewicht'i / 'van het geheel'i /
|
21
22
|
'min.'i / 'min'i / 'max.'i / 'max'i
|
22
23
|
) )?
|
@@ -32,6 +33,7 @@ module FoodIngredientParser::Strict::Grammar
|
|
32
33
|
|
33
34
|
rule amount_simple_unit
|
34
35
|
( percent / ( ( 'procent' / 'percent' / 'gram'i / 'ml'i / 'mg'i / 'gr'i / 'g'i ) !char ) )
|
36
|
+
( ws 'vol'i ( !char / '.' ) )?
|
35
37
|
end
|
36
38
|
end
|
37
39
|
end
|
@@ -48,7 +48,7 @@ module FoodIngredientParser::Strict::Grammar
|
|
48
48
|
end
|
49
49
|
|
50
50
|
rule and
|
51
|
-
( 'and' / 'en' / 'und' / '&'
|
51
|
+
( 'and' / 'en' / 'und' ) !char / '&'
|
52
52
|
end
|
53
53
|
|
54
54
|
rule abbrev
|
@@ -105,8 +105,12 @@ module FoodIngredientParser::Strict::Grammar
|
|
105
105
|
'w.o'i /
|
106
106
|
'w.v'i /
|
107
107
|
# not auto-generated additions
|
108
|
-
'vit'i /
|
109
|
-
'denat'i
|
108
|
+
'vit'i / # vitamin
|
109
|
+
'denat'i / # denaturated
|
110
|
+
'alc'i / # alcohol
|
111
|
+
'vol'i / # volume
|
112
|
+
'conc'i / # concentration
|
113
|
+
'subsp'i # subspecies
|
110
114
|
)
|
111
115
|
'.'? ![[:alpha:]]
|
112
116
|
end
|
@@ -116,7 +120,11 @@ module FoodIngredientParser::Strict::Grammar
|
|
116
120
|
(
|
117
121
|
'N°'i /
|
118
122
|
'°C'i /
|
119
|
-
( 'ijzer'i / 'chroom'i / 'koper'i ) ws* '(' 'I'i+ ')' ws* [[:alnum:]]+
|
123
|
+
( 'ijzer'i / 'chroom'i / 'koper'i ) ws* '(' 'I'i+ ')' ws* [[:alnum:]]+ /
|
124
|
+
'L(+)-' [[:alnum:]]+ /
|
125
|
+
'L.' ws+ 'rhamnosus'i / 'L.' ws+ 'acidophilus'i / 'L.' ws+ 'casei' / 'B.'i ws+ 'lactis'i / 'A.'i ws+ 'oryzae'i /
|
126
|
+
'type'i ws+ '"' [0-9]+ '"' /
|
127
|
+
'E'i '-'? [0-9] [0-9] [0-9] [[:alpha:]]? ws* '(' 'i'i+ ')' # e.g. "E450 (iii)"
|
120
128
|
) ![[:alpha:]]
|
121
129
|
end
|
122
130
|
end
|
@@ -5,8 +5,12 @@ module FoodIngredientParser::Strict::Grammar
|
|
5
5
|
include Ingredient
|
6
6
|
|
7
7
|
rule list_coloned
|
8
|
+
contains:( ( ws* list_coloned_ingredient ws* '.,')+ ws* list_coloned_ingredient ) <ListNode> /
|
9
|
+
contains:( ( ws* list_coloned_ingredient ws* '.,')+ ) <ListNode> /
|
8
10
|
contains:( ( ws* list_coloned_ingredient ws* '.' )+ ws* list_coloned_ingredient ) <ListNode> /
|
9
11
|
contains:( ( ws* list_coloned_ingredient ws* '.' )+ ) <ListNode> /
|
12
|
+
contains:( ( ws* list_coloned_ingredient ws* ';,')+ ws* list_coloned_ingredient ) <ListNode> /
|
13
|
+
contains:( ( ws* list_coloned_ingredient ws* ';,')+ ) <ListNode> /
|
10
14
|
contains:( ( ws* list_coloned_ingredient ws* ';' )+ ws* list_coloned_ingredient ) <ListNode> /
|
11
15
|
contains:( ( ws* list_coloned_ingredient ws* ';' )+ ) <ListNode> /
|
12
16
|
contains:( ws* list_coloned_ingredient ) <ListNode>
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: food_ingredient_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.2
|
4
|
+
version: 1.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- wvengen
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-
|
11
|
+
date: 2018-10-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: treetop
|