food_ingredient_parser 1.1.2 → 1.1.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 64ad7a10a1480b520602113bbcdfc10ba1daf8b5
4
- data.tar.gz: 4068a9edbe1dca908228f38d2795ad63a5cbcf76
3
+ metadata.gz: 77503a77f269805d23ecb60a7b9d63401063e140
4
+ data.tar.gz: 412d0c6aab0924371677cb8e2a4201349f4684a8
5
5
  SHA512:
6
- metadata.gz: 73ce876757b08e1d2cf0b5126e8d024b3728260134c3c4f3fe49fee14793da77ecc48d286165dc0c86e8363f2eddf6081355ac26d38f524371b367f4aa3cee23
7
- data.tar.gz: befa97dc0fd4605cd2019a2cf7a39aa15d5dfa15acf118f10e9b28e104bbc9bd5925f28286e780acb142eb73a935dba26657f1c6fe9cb766e7f4f88310d5ce55
6
+ metadata.gz: 82ba24e9277917326348e769feda731b030a9f6477f811f067b1ff1377e784d206adc56407a607e5b3498ddbbfcc8751c7ef6b3583490f59fa56771bdd9f0a80
7
+ data.tar.gz: '0768160047a661cad810a4fda0c7922e03b039bd17f4aa7b2022722cb636080fa01690a731da8d088861c6981163803b41d2bd577093f4b0a91250d4fe68c03d'
data/README.md CHANGED
@@ -185,9 +185,23 @@ So if you only use the stable interface (`parse`, `to_h` and `to_html`), you can
185
185
  to e.g. `~> 1.0`. If you depend on more, lock your version against e.g. `~> 1.0.0` and test when you
186
186
  upgrade to `1.1`.
187
187
 
188
+ ## Languages
189
+
190
+ While most of the parsing is language-independent, some parts need knowledge about certain words
191
+ (like abbreviations and amount specifiers). The gem was developed with ingredient lists in Dutch (nl),
192
+ plus a bit of English and German. Support for other languages is already good, but lacks in certain
193
+ areas: improvements are welcome (starting with a corpus in [data/](data/)).
194
+
195
+ Many ingredient lists from the USA are structured a bit differently than those from Europe, so they
195
+ parse less well (but that's a matter of fine-tuning).
197
+
188
198
  ## Test data
189
199
 
190
- [`data/ingredient-samples-nl`](data/ingredient-samples-nl) contains about 150k
200
+ [`data/ingredient-samples-qm-nl`](data/ingredient-samples-qm-nl) contains about 150k
191
201
  real-world ingredient lists found on the Dutch market. Each line contains one ingredient
192
202
  list (newlines are encoded as `\n`, empty lines and those starting with `#` are ignored).
193
203
  The strict parser currently parses 80%, while the loose parser returns something for all of them.
204
+
205
+ ## License
206
+
207
+ This software is distributed under the [MIT license](LICENSE). Data may have a [different license](data/README.md).
@@ -8,12 +8,23 @@ module FoodIngredientParser::Loose
8
8
  PREFIX_RE = /\A\s*(ingredients|contains|ingred[iï][eë]nt(en)?(declaratie)?|bevat|dit zit er\s?in|samenstelling|zutaten)\b\s*[:;.]?\s*/i.freeze
9
9
  NOTE_RE = /\A\b(dit product kan\b|deze verpakking kan\b|kan sporen\b.*?\bbevatten\b|voor allergenen\b|allergenen\b|E\s*=|gemaakt in\b|geproduceerd in\b|bevat mogelijk\b|kijk voor meer\b|allergie-info|in de fabriek\b|in dit bedrijf\b|voor [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bis [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bgebruikt\b)/i.freeze
10
10
  # Keep in sync with +abbrev+ in the +Common+ grammar, plus relevant ones from the +Amount+ grammar.
11
- ABBREV_RE = Regexp.union(/\A(N°|°C|(ijzer|chroom|koper)\s*\(I+\)\s*[[:alnum:]]+)\b/i, *%w[
12
- a.o.p b.g.a b.o.b c.a c.i d.e d.m.v d.o.c d.o.p d.s e.a e.g e.u f.i.l f.o.s i.a
13
- i.d i.e i.g.m.e i.g.p i.m.v i.o i.v.m l.s.l n.a n.b n.o n.v.t o.a o.b.v p.d.o
14
- p.g.i q.s s.l s.s t.o.v u.h.t v.g v.s w.a w.o w.v vit denat
15
- min max ca
16
- ].map {|s| /\A#{Regexp.escape(s)}\b\.?/i}).freeze
11
+ ABBREV_RE = Regexp.union(
12
+ /\A(
13
+ N°\b |
14
+ °C\b |
15
+ (ijzer|chroom|koper)\s*\(I+\)\s*[[:alnum:]]+\b |
16
+ L\(\+\)-[[:alnum:]]+\b |
17
+ type\s+"\d+" |
18
+ L\.\s+rhamnosus\b | L\.\s+acidophilus\b | L\.\s+casei\b | B\.\s+lactis | A\.\s+oryzae
19
+ E-e?\d{3}[a-z]?\s*\(i+\)
20
+ )/xi,
21
+ *%w[
22
+ a.o.p b.g.a b.o.b c.a c.i d.e d.m.v d.o.c d.o.p d.s e.a e.g e.u f.i.l f.o.s i.a
23
+ i.d i.e i.g.m.e i.g.p i.m.v i.o i.v.m l.s.l n.a n.b n.o n.v.t o.a o.b.v p.d.o
24
+ p.g.i q.s s.l s.s t.o.v u.h.t v.g v.s w.a w.o w.v vit denat alc vol conc subsp
25
+ min max ca
26
+ ].map {|s| /\A#{Regexp.escape(s)}\b\.?/i}
27
+ ).freeze
17
28
 
18
29
  def initialize(s, index: 0)
19
30
  @s = s # input string
@@ -7,8 +7,9 @@ module FoodIngredientParser::Loose
7
7
  #
8
8
  # @note mark and amount is lost, this is not expected on e-numbers
9
9
 
10
- SPLIT_RE = /\s*-\s*/.freeze
11
- MATCH_RE = /\A\s*(e[0-9]{3}[a-z]?)(?:#{SPLIT_RE}(e[0-9]{3}[a-z]?))+\s*\z/i.freeze
10
+ SPLIT_RE = /\s*-\s*/.freeze
11
+ SINGLE_RE = /E-?\d{3}[a-z]?(?:\s*\(i+\))?/i.freeze
12
+ MATCH_RE = /\A\s*(#{SINGLE_RE})(?:#{SPLIT_RE}(#{SINGLE_RE}))+\s*\z/i.freeze
12
13
 
13
14
  def self.transform!(node)
14
15
  new(node).transform!
@@ -17,6 +17,7 @@ module FoodIngredientParser::Strict::Grammar
17
17
  ) ws* )?
18
18
  amount_simple_quantity
19
19
  ( ws+ (
20
+ 'of'i / 'or less of'i / 'or more of'i /
20
21
  'minimaal'i / 'minimum'i / 'van het uitlekgewicht'i / 'van het geheel'i /
21
22
  'min.'i / 'min'i / 'max.'i / 'max'i
22
23
  ) )?
@@ -32,6 +33,7 @@ module FoodIngredientParser::Strict::Grammar
32
33
 
33
34
  rule amount_simple_unit
34
35
  ( percent / ( ( 'procent' / 'percent' / 'gram'i / 'ml'i / 'mg'i / 'gr'i / 'g'i ) !char ) )
36
+ ( ws 'vol'i ( !char / '.' ) )?
35
37
  end
36
38
  end
37
39
  end
@@ -48,7 +48,7 @@ module FoodIngredientParser::Strict::Grammar
48
48
  end
49
49
 
50
50
  rule and
51
- ( 'and' / 'en' / 'und' / '&' ) !char
51
+ ( 'and' / 'en' / 'und' ) !char / '&'
52
52
  end
53
53
 
54
54
  rule abbrev
@@ -105,8 +105,12 @@ module FoodIngredientParser::Strict::Grammar
105
105
  'w.o'i /
106
106
  'w.v'i /
107
107
  # not auto-generated additions
108
- 'vit'i /
109
- 'denat'i
108
+ 'vit'i / # vitamin
109
+ 'denat'i / # denaturated
110
+ 'alc'i / # alcohol
111
+ 'vol'i / # volume
112
+ 'conc'i / # concentration
113
+ 'subsp'i # subspecies
110
114
  )
111
115
  '.'? ![[:alpha:]]
112
116
  end
@@ -116,7 +120,11 @@ module FoodIngredientParser::Strict::Grammar
116
120
  (
117
121
  'N°'i /
118
122
  '°C'i /
119
- ( 'ijzer'i / 'chroom'i / 'koper'i ) ws* '(' 'I'i+ ')' ws* [[:alnum:]]+
123
+ ( 'ijzer'i / 'chroom'i / 'koper'i ) ws* '(' 'I'i+ ')' ws* [[:alnum:]]+ /
124
+ 'L(+)-' [[:alnum:]]+ /
125
+ 'L.' ws+ 'rhamnosus'i / 'L.' ws+ 'acidophilus'i / 'L.' ws+ 'casei' / 'B.'i ws+ 'lactis'i / 'A.'i ws+ 'oryzae'i /
126
+ 'type'i ws+ '"' [0-9]+ '"' /
127
+ 'E'i '-'? [0-9] [0-9] [0-9] [[:alpha:]]? ws* '(' 'i'i+ ')' # e.g. "E450 (iii)"
120
128
  ) ![[:alpha:]]
121
129
  end
122
130
  end
@@ -21,7 +21,7 @@ module FoodIngredientParser::Strict::Grammar
21
21
 
22
22
  rule ingredient_nested_contains
23
23
  'contains'i /
24
- 'bevat'i
24
+ 'bevat'i / 'bevat o.a.'i / 'o.a.'i / 'met'i
25
25
  end
26
26
 
27
27
  end
@@ -5,8 +5,12 @@ module FoodIngredientParser::Strict::Grammar
5
5
  include Ingredient
6
6
 
7
7
  rule list_coloned
8
+ contains:( ( ws* list_coloned_ingredient ws* '.,')+ ws* list_coloned_ingredient ) <ListNode> /
9
+ contains:( ( ws* list_coloned_ingredient ws* '.,')+ ) <ListNode> /
8
10
  contains:( ( ws* list_coloned_ingredient ws* '.' )+ ws* list_coloned_ingredient ) <ListNode> /
9
11
  contains:( ( ws* list_coloned_ingredient ws* '.' )+ ) <ListNode> /
12
+ contains:( ( ws* list_coloned_ingredient ws* ';,')+ ws* list_coloned_ingredient ) <ListNode> /
13
+ contains:( ( ws* list_coloned_ingredient ws* ';,')+ ) <ListNode> /
10
14
  contains:( ( ws* list_coloned_ingredient ws* ';' )+ ws* list_coloned_ingredient ) <ListNode> /
11
15
  contains:( ( ws* list_coloned_ingredient ws* ';' )+ ) <ListNode> /
12
16
  contains:( ws* list_coloned_ingredient ) <ListNode>
@@ -1,4 +1,4 @@
1
1
  module FoodIngredientParser
2
- VERSION = '1.1.2'
3
- VERSION_DATE = '2018-09-28'
2
+ VERSION = '1.1.3'
3
+ VERSION_DATE = '2018-10-12'
4
4
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: food_ingredient_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.2
4
+ version: 1.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - wvengen
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-09-28 00:00:00.000000000 Z
11
+ date: 2018-10-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: treetop