RubyGems - food_ingredient_parser - Versions diffs - 1.1.2 → 1.1.3 - Mend

food_ingredient_parser 1.1.2 → 1.1.3

Files changed (10) hide show

checksums.yaml +4 -4
data/README.md +15 -1
data/lib/food_ingredient_parser/loose/scanner.rb +17 -6
data/lib/food_ingredient_parser/loose/transform/split_e_numbers.rb +3 -2
data/lib/food_ingredient_parser/strict/grammar/amount.treetop +2 -0
data/lib/food_ingredient_parser/strict/grammar/common.treetop +12 -4
data/lib/food_ingredient_parser/strict/grammar/ingredient_nested.treetop +1 -1
data/lib/food_ingredient_parser/strict/grammar/list_coloned.treetop +4 -0
data/lib/food_ingredient_parser/version.rb +2 -2
metadata +2 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 64ad7a10a1480b520602113bbcdfc10ba1daf8b5
-  data.tar.gz: 4068a9edbe1dca908228f38d2795ad63a5cbcf76
+  metadata.gz: 77503a77f269805d23ecb60a7b9d63401063e140
+  data.tar.gz: 412d0c6aab0924371677cb8e2a4201349f4684a8
 SHA512:
-  metadata.gz: 73ce876757b08e1d2cf0b5126e8d024b3728260134c3c4f3fe49fee14793da77ecc48d286165dc0c86e8363f2eddf6081355ac26d38f524371b367f4aa3cee23
-  data.tar.gz: befa97dc0fd4605cd2019a2cf7a39aa15d5dfa15acf118f10e9b28e104bbc9bd5925f28286e780acb142eb73a935dba26657f1c6fe9cb766e7f4f88310d5ce55
+  metadata.gz: 82ba24e9277917326348e769feda731b030a9f6477f811f067b1ff1377e784d206adc56407a607e5b3498ddbbfcc8751c7ef6b3583490f59fa56771bdd9f0a80
+  data.tar.gz: '0768160047a661cad810a4fda0c7922e03b039bd17f4aa7b2022722cb636080fa01690a731da8d088861c6981163803b41d2bd577093f4b0a91250d4fe68c03d'

data/README.md CHANGED

@@ -185,9 +185,23 @@ So if you only use the stable interface (`parse`, `to_h` and `to_html`), you can
 to e.g. `~> 1.0`. If you depend on more, lock your version against e.g. `~> 1.0.0` and test when you
 upgrade to `1.1`.
+## Languages
+While most of the parsing is language-independent, some parts need knowledge about certain words
+(like abbreviations and amount specifiers). The gem was developed with ingredient lists in Dutch (nl),
+plus a bit of English and German. Support for other languages is already good, but is lacking in certain
+areas: improvements are welcome (starting with a corpus in [data/](data/)).
+Many ingredient lists from the USA are structured a bit differently than those from Europe, so they
+parse less well (though that's a matter of fine-tuning).
 ## Test data
-[`data/ingredient-samples-nl`](data/ingredient-samples-nl) contains about 150k
+[`data/ingredient-samples-qm-nl`](data/ingredient-samples-qm-nl) contains about 150k
 real-world ingredient lists found on the Dutch market. Each line contains one ingredient
 list (newlines are encoded as `\n`, empty lines and those starting with `#` are ignored).
 The strict parser currently parses 80%, while the loose parser returns something for all of them.
+## License
+This software is distributed under the [MIT license](LICENSE). Data may have a [different license](data/README.md).

data/lib/food_ingredient_parser/loose/scanner.rb CHANGED

@@ -8,12 +8,23 @@ module FoodIngredientParser::Loose
     PREFIX_RE  = /\A\s*(ingredients|contains|ingred[iï][eë]nt(en)?(declaratie)?|bevat|dit zit er\s?in|samenstelling|zutaten)\b\s*[:;.]?\s*/i.freeze
     NOTE_RE    = /\A\b(dit product kan\b|deze verpakking kan\b|kan sporen\b.*?\bbevatten\b|voor allergenen\b|allergenen\b|E\s*=|gemaakt in\b|geproduceerd in\b|bevat mogelijk\b|kijk voor meer\b|allergie-info|in de fabriek\b|in dit bedrijf\b|voor [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bis [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bgebruikt\b)/i.freeze
     # Keep in sync with +abbrev+ in the +Common+ grammar, plus relevant ones from the +Amount+ grammar.
-    ABBREV_RE  = Regexp.union(/\A(N°|°C|(ijzer|chroom|koper)\s*\(I+\)\s*[[:alnum:]]+)\b/i, *%w[
-      a.o.p b.g.a b.o.b c.a c.i d.e d.m.v d.o.c d.o.p d.s e.a e.g e.u f.i.l f.o.s i.a
-      i.d i.e i.g.m.e i.g.p i.m.v i.o i.v.m l.s.l n.a n.b n.o n.v.t o.a o.b.v p.d.o
-      p.g.i q.s s.l s.s t.o.v u.h.t v.g v.s w.a w.o w.v vit denat
-      min max ca
-    ].map {|s| /\A#{Regexp.escape(s)}\b\.?/i}).freeze
+    ABBREV_RE  = Regexp.union(
+      /\A(
+        N°\b |
+        °C\b |
+        (ijzer|chroom|koper)\s*\(I+\)\s*[[:alnum:]]+\b |
+        L\(\+\)-[[:alnum:]]+\b |
+        type\s+"\d+" |
+        L\.\s+rhamnosus\b | L\.\s+acidophilus\b | L\.\s+casei\b | B\.\s+lactis | A\.\s+oryzae
+        E-e?\d{3}[a-z]?\s*\(i+\)
+      )/xi,
+      *%w[
+        a.o.p b.g.a b.o.b c.a c.i d.e d.m.v d.o.c d.o.p d.s e.a e.g e.u f.i.l f.o.s i.a
+        i.d i.e i.g.m.e i.g.p i.m.v i.o i.v.m l.s.l n.a n.b n.o n.v.t o.a o.b.v p.d.o
+        p.g.i q.s s.l s.s t.o.v u.h.t v.g v.s w.a w.o w.v vit denat alc vol conc subsp
+        min max ca
+      ].map {|s| /\A#{Regexp.escape(s)}\b\.?/i}
+    ).freeze
     def initialize(s, index: 0)
       @s = s                           # input string

data/lib/food_ingredient_parser/loose/transform/split_e_numbers.rb CHANGED

@@ -7,8 +7,9 @@ module FoodIngredientParser::Loose
       #
       # @note mark and amount is lost, this is not expected on e-numbers
-      SPLIT_RE = /\s*-\s*/.freeze
-      MATCH_RE = /\A\s*(e[0-9]{3}[a-z]?)(?:#{SPLIT_RE}(e[0-9]{3}[a-z]?))+\s*\z/i.freeze
+      SPLIT_RE  = /\s*-\s*/.freeze
+      SINGLE_RE = /E-?\d{3}[a-z]?(?:\s*\(i+\))?/i.freeze
+      MATCH_RE  = /\A\s*(#{SINGLE_RE})(?:#{SPLIT_RE}(#{SINGLE_RE}))+\s*\z/i.freeze
       def self.transform!(node)
         new(node).transform!

data/lib/food_ingredient_parser/strict/grammar/amount.treetop CHANGED

@@ -17,6 +17,7 @@ module FoodIngredientParser::Strict::Grammar
       ) ws* )?
       amount_simple_quantity
       ( ws+ (
+        'of'i / 'or less of'i / 'or more of'i /
         'minimaal'i / 'minimum'i / 'van het uitlekgewicht'i / 'van het geheel'i /
         'min.'i / 'min'i / 'max.'i / 'max'i
       ) )?
@@ -32,6 +33,7 @@ module FoodIngredientParser::Strict::Grammar
     rule amount_simple_unit
       ( percent / ( ( 'procent' / 'percent' / 'gram'i / 'ml'i / 'mg'i / 'gr'i / 'g'i ) !char ) )
+      ( ws 'vol'i ( !char / '.' ) )?
     end
   end
 end

data/lib/food_ingredient_parser/strict/grammar/common.treetop CHANGED

@@ -48,7 +48,7 @@ module FoodIngredientParser::Strict::Grammar
     end
     rule and
-      ( 'and' / 'en' / 'und' / '&' ) !char
+      ( 'and' / 'en' / 'und' ) !char / '&'
     end
     rule abbrev
@@ -105,8 +105,12 @@ module FoodIngredientParser::Strict::Grammar
         'w.o'i /
         'w.v'i /
         # not auto-generated additions
-        'vit'i /
-        'denat'i
+        'vit'i /   # vitamin
+        'denat'i / # denaturated
+        'alc'i /   # alcohol
+        'vol'i /   # volume
+        'conc'i /  # concentration
+        'subsp'i   # subspecies
       )
       '.'? ![[:alpha:]]
     end
@@ -116,7 +120,11 @@ module FoodIngredientParser::Strict::Grammar
       (
         'N°'i /
         '°C'i /
-        ( 'ijzer'i / 'chroom'i / 'koper'i ) ws* '(' 'I'i+ ')' ws* [[:alnum:]]+
+        ( 'ijzer'i / 'chroom'i / 'koper'i ) ws* '(' 'I'i+ ')' ws* [[:alnum:]]+ /
+        'L(+)-' [[:alnum:]]+ /
+        'L.' ws+ 'rhamnosus'i / 'L.' ws+ 'acidophilus'i / 'L.' ws+ 'casei' / 'B.'i ws+ 'lactis'i / 'A.'i ws+ 'oryzae'i /
+        'type'i ws+ '"' [0-9]+ '"' /
+        'E'i '-'? [0-9] [0-9] [0-9] [[:alpha:]]? ws* '(' 'i'i+ ')' # e.g. "E450 (iii)"
       ) ![[:alpha:]]
     end
   end

data/lib/food_ingredient_parser/strict/grammar/ingredient_nested.treetop CHANGED

@@ -21,7 +21,7 @@ module FoodIngredientParser::Strict::Grammar
     rule ingredient_nested_contains
       'contains'i /
-      'bevat'i
+      'bevat'i / 'bevat o.a.'i / 'o.a.'i / 'met'i
     end
   end

data/lib/food_ingredient_parser/strict/grammar/list_coloned.treetop CHANGED

@@ -5,8 +5,12 @@ module FoodIngredientParser::Strict::Grammar
     include Ingredient
     rule list_coloned
+      contains:( ( ws* list_coloned_ingredient ws* '.,')+ ws* list_coloned_ingredient ) <ListNode> /
+      contains:( ( ws* list_coloned_ingredient ws* '.,')+                             ) <ListNode> /
       contains:( ( ws* list_coloned_ingredient ws* '.' )+ ws* list_coloned_ingredient ) <ListNode> /
       contains:( ( ws* list_coloned_ingredient ws* '.' )+                             ) <ListNode> /
+      contains:( ( ws* list_coloned_ingredient ws* ';,')+ ws* list_coloned_ingredient ) <ListNode> /
+      contains:( ( ws* list_coloned_ingredient ws* ';,')+                             ) <ListNode> /
       contains:( ( ws* list_coloned_ingredient ws* ';' )+ ws* list_coloned_ingredient ) <ListNode> /
       contains:( ( ws* list_coloned_ingredient ws* ';' )+                             ) <ListNode> /
       contains:(   ws* list_coloned_ingredient )                                        <ListNode>

data/lib/food_ingredient_parser/version.rb CHANGED

@@ -1,4 +1,4 @@
 module FoodIngredientParser
-  VERSION      = '1.1.2'
-  VERSION_DATE = '2018-09-28'
+  VERSION      = '1.1.3'
+  VERSION_DATE = '2018-10-12'
 end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: food_ingredient_parser
 version: !ruby/object:Gem::Version
-  version: 1.1.2
+  version: 1.1.3
 platform: ruby
 authors:
 - wvengen
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2018-09-28 00:00:00.000000000 Z
+date: 2018-10-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: treetop