food_ingredient_parser 1.1.2 → 1.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 64ad7a10a1480b520602113bbcdfc10ba1daf8b5
4
- data.tar.gz: 4068a9edbe1dca908228f38d2795ad63a5cbcf76
3
+ metadata.gz: 77503a77f269805d23ecb60a7b9d63401063e140
4
+ data.tar.gz: 412d0c6aab0924371677cb8e2a4201349f4684a8
5
5
  SHA512:
6
- metadata.gz: 73ce876757b08e1d2cf0b5126e8d024b3728260134c3c4f3fe49fee14793da77ecc48d286165dc0c86e8363f2eddf6081355ac26d38f524371b367f4aa3cee23
7
- data.tar.gz: befa97dc0fd4605cd2019a2cf7a39aa15d5dfa15acf118f10e9b28e104bbc9bd5925f28286e780acb142eb73a935dba26657f1c6fe9cb766e7f4f88310d5ce55
6
+ metadata.gz: 82ba24e9277917326348e769feda731b030a9f6477f811f067b1ff1377e784d206adc56407a607e5b3498ddbbfcc8751c7ef6b3583490f59fa56771bdd9f0a80
7
+ data.tar.gz: '0768160047a661cad810a4fda0c7922e03b039bd17f4aa7b2022722cb636080fa01690a731da8d088861c6981163803b41d2bd577093f4b0a91250d4fe68c03d'
data/README.md CHANGED
@@ -185,9 +185,23 @@ So if you only use the stable interface (`parse`, `to_h` and `to_html`), you can
185
185
  to e.g. `~> 1.0`. If you depend on more, lock your version against e.g. `~> 1.0.0` and test when you
186
186
  upgrade to `1.1`.
187
187
 
188
+ ## Languages
189
+
190
+ While most of the parsing is language-independent, some parts need knowledge about certain words
191
+ (like abbreviations and amount specifiers). The gem was developed with ingredient lists in Dutch (nl),
192
+ plus a bit of English and German. Support for other languages is already good, but lacks in certain
193
+ areas: improvements are welcome (starting with a corpus in [data/](data/)).
194
+
195
+ Many ingredient lists from the USA are structured a bit differently than those from Europe, they
196
+ parse less well (though that's a matter of fine-tuning).
197
+
188
198
  ## Test data
189
199
 
190
- [`data/ingredient-samples-nl`](data/ingredient-samples-nl) contains about 150k
200
+ [`data/ingredient-samples-qm-nl`](data/ingredient-samples-qm-nl) contains about 150k
191
201
  real-world ingredient lists found on the Dutch market. Each line contains one ingredient
192
202
  list (newlines are encoded as `\n`, empty lines and those starting with `#` are ignored).
193
203
  The strict parser currently parses 80%, while the loose parser returns something for all of them.
204
+
205
+ ## License
206
+
207
+ This software is distributed under the [MIT license](LICENSE). Data may have a [different license](data/README.md).
@@ -8,12 +8,23 @@ module FoodIngredientParser::Loose
8
8
  PREFIX_RE = /\A\s*(ingredients|contains|ingred[iï][eë]nt(en)?(declaratie)?|bevat|dit zit er\s?in|samenstelling|zutaten)\b\s*[:;.]?\s*/i.freeze
9
9
  NOTE_RE = /\A\b(dit product kan\b|deze verpakking kan\b|kan sporen\b.*?\bbevatten\b|voor allergenen\b|allergenen\b|E\s*=|gemaakt in\b|geproduceerd in\b|bevat mogelijk\b|kijk voor meer\b|allergie-info|in de fabriek\b|in dit bedrijf\b|voor [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bis [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bgebruikt\b)/i.freeze
10
10
  # Keep in sync with +abbrev+ in the +Common+ grammar, plus relevant ones from the +Amount+ grammar.
11
- ABBREV_RE = Regexp.union(/\A(N°|°C|(ijzer|chroom|koper)\s*\(I+\)\s*[[:alnum:]]+)\b/i, *%w[
12
- a.o.p b.g.a b.o.b c.a c.i d.e d.m.v d.o.c d.o.p d.s e.a e.g e.u f.i.l f.o.s i.a
13
- i.d i.e i.g.m.e i.g.p i.m.v i.o i.v.m l.s.l n.a n.b n.o n.v.t o.a o.b.v p.d.o
14
- p.g.i q.s s.l s.s t.o.v u.h.t v.g v.s w.a w.o w.v vit denat
15
- min max ca
16
- ].map {|s| /\A#{Regexp.escape(s)}\b\.?/i}).freeze
11
+ ABBREV_RE = Regexp.union(
12
+ /\A(
13
+ N°\b |
14
+ °C\b |
15
+ (ijzer|chroom|koper)\s*\(I+\)\s*[[:alnum:]]+\b |
16
+ L\(\+\)-[[:alnum:]]+\b |
17
+ type\s+"\d+" |
18
+ L\.\s+rhamnosus\b | L\.\s+acidophilus\b | L\.\s+casei\b | B\.\s+lactis | A\.\s+oryzae
19
+ E-e?\d{3}[a-z]?\s*\(i+\)
20
+ )/xi,
21
+ *%w[
22
+ a.o.p b.g.a b.o.b c.a c.i d.e d.m.v d.o.c d.o.p d.s e.a e.g e.u f.i.l f.o.s i.a
23
+ i.d i.e i.g.m.e i.g.p i.m.v i.o i.v.m l.s.l n.a n.b n.o n.v.t o.a o.b.v p.d.o
24
+ p.g.i q.s s.l s.s t.o.v u.h.t v.g v.s w.a w.o w.v vit denat alc vol conc subsp
25
+ min max ca
26
+ ].map {|s| /\A#{Regexp.escape(s)}\b\.?/i}
27
+ ).freeze
17
28
 
18
29
  def initialize(s, index: 0)
19
30
  @s = s # input string
@@ -7,8 +7,9 @@ module FoodIngredientParser::Loose
7
7
  #
8
8
  # @note mark and amount is lost, this is not expected on e-numbers
9
9
 
10
- SPLIT_RE = /\s*-\s*/.freeze
11
- MATCH_RE = /\A\s*(e[0-9]{3}[a-z]?)(?:#{SPLIT_RE}(e[0-9]{3}[a-z]?))+\s*\z/i.freeze
10
+ SPLIT_RE = /\s*-\s*/.freeze
11
+ SINGLE_RE = /E-?\d{3}[a-z]?(?:\s*\(i+\))?/i.freeze
12
+ MATCH_RE = /\A\s*(#{SINGLE_RE})(?:#{SPLIT_RE}(#{SINGLE_RE}))+\s*\z/i.freeze
12
13
 
13
14
  def self.transform!(node)
14
15
  new(node).transform!
@@ -17,6 +17,7 @@ module FoodIngredientParser::Strict::Grammar
17
17
  ) ws* )?
18
18
  amount_simple_quantity
19
19
  ( ws+ (
20
+ 'of'i / 'or less of'i / 'or more of'i /
20
21
  'minimaal'i / 'minimum'i / 'van het uitlekgewicht'i / 'van het geheel'i /
21
22
  'min.'i / 'min'i / 'max.'i / 'max'i
22
23
  ) )?
@@ -32,6 +33,7 @@ module FoodIngredientParser::Strict::Grammar
32
33
 
33
34
  rule amount_simple_unit
34
35
  ( percent / ( ( 'procent' / 'percent' / 'gram'i / 'ml'i / 'mg'i / 'gr'i / 'g'i ) !char ) )
36
+ ( ws 'vol'i ( !char / '.' ) )?
35
37
  end
36
38
  end
37
39
  end
@@ -48,7 +48,7 @@ module FoodIngredientParser::Strict::Grammar
48
48
  end
49
49
 
50
50
  rule and
51
- ( 'and' / 'en' / 'und' / '&' ) !char
51
+ ( 'and' / 'en' / 'und' ) !char / '&'
52
52
  end
53
53
 
54
54
  rule abbrev
@@ -105,8 +105,12 @@ module FoodIngredientParser::Strict::Grammar
105
105
  'w.o'i /
106
106
  'w.v'i /
107
107
  # not auto-generated additions
108
- 'vit'i /
109
- 'denat'i
108
+ 'vit'i / # vitamin
109
+ 'denat'i / # denaturated
110
+ 'alc'i / # alcohol
111
+ 'vol'i / # volume
112
+ 'conc'i / # concentration
113
+ 'subsp'i # subspecies
110
114
  )
111
115
  '.'? ![[:alpha:]]
112
116
  end
@@ -116,7 +120,11 @@ module FoodIngredientParser::Strict::Grammar
116
120
  (
117
121
  'N°'i /
118
122
  '°C'i /
119
- ( 'ijzer'i / 'chroom'i / 'koper'i ) ws* '(' 'I'i+ ')' ws* [[:alnum:]]+
123
+ ( 'ijzer'i / 'chroom'i / 'koper'i ) ws* '(' 'I'i+ ')' ws* [[:alnum:]]+ /
124
+ 'L(+)-' [[:alnum:]]+ /
125
+ 'L.' ws+ 'rhamnosus'i / 'L.' ws+ 'acidophilus'i / 'L.' ws+ 'casei' / 'B.'i ws+ 'lactis'i / 'A.'i ws+ 'oryzae'i /
126
+ 'type'i ws+ '"' [0-9]+ '"' /
127
+ 'E'i '-'? [0-9] [0-9] [0-9] [[:alpha:]]? ws* '(' 'i'i+ ')' # e.g. "E450 (iii)"
120
128
  ) ![[:alpha:]]
121
129
  end
122
130
  end
@@ -21,7 +21,7 @@ module FoodIngredientParser::Strict::Grammar
21
21
 
22
22
  rule ingredient_nested_contains
23
23
  'contains'i /
24
- 'bevat'i
24
+ 'bevat'i / 'bevat o.a.'i / 'o.a.'i / 'met'i
25
25
  end
26
26
 
27
27
  end
@@ -5,8 +5,12 @@ module FoodIngredientParser::Strict::Grammar
5
5
  include Ingredient
6
6
 
7
7
  rule list_coloned
8
+ contains:( ( ws* list_coloned_ingredient ws* '.,')+ ws* list_coloned_ingredient ) <ListNode> /
9
+ contains:( ( ws* list_coloned_ingredient ws* '.,')+ ) <ListNode> /
8
10
  contains:( ( ws* list_coloned_ingredient ws* '.' )+ ws* list_coloned_ingredient ) <ListNode> /
9
11
  contains:( ( ws* list_coloned_ingredient ws* '.' )+ ) <ListNode> /
12
+ contains:( ( ws* list_coloned_ingredient ws* ';,')+ ws* list_coloned_ingredient ) <ListNode> /
13
+ contains:( ( ws* list_coloned_ingredient ws* ';,')+ ) <ListNode> /
10
14
  contains:( ( ws* list_coloned_ingredient ws* ';' )+ ws* list_coloned_ingredient ) <ListNode> /
11
15
  contains:( ( ws* list_coloned_ingredient ws* ';' )+ ) <ListNode> /
12
16
  contains:( ws* list_coloned_ingredient ) <ListNode>
@@ -1,4 +1,4 @@
1
1
  module FoodIngredientParser
2
- VERSION = '1.1.2'
3
- VERSION_DATE = '2018-09-28'
2
+ VERSION = '1.1.3'
3
+ VERSION_DATE = '2018-10-12'
4
4
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: food_ingredient_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.2
4
+ version: 1.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - wvengen
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-09-28 00:00:00.000000000 Z
11
+ date: 2018-10-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: treetop