RubyGems - food_ingredient_parser - Versions diffs - 1.0.0.pre.8 → 1.0.0.pre.9 - Mend

food_ingredient_parser 1.0.0.pre.8 → 1.0.0.pre.9

Files changed (13) hide show

checksums.yaml +4 -4
data/README.md +4 -3
data/lib/food_ingredient_parser/loose/node.rb +1 -1
data/lib/food_ingredient_parser/loose/parser.rb +4 -1
data/lib/food_ingredient_parser/loose/scanner.rb +31 -9
data/lib/food_ingredient_parser/loose/transform/handle_missing_name.rb +64 -0
data/lib/food_ingredient_parser/strict/grammar/amount.treetop +16 -10
data/lib/food_ingredient_parser/strict/grammar/common.treetop +11 -0
data/lib/food_ingredient_parser/strict/grammar/ingredient_nested.treetop +8 -8
data/lib/food_ingredient_parser/strict/grammar/list_coloned.treetop +5 -3
data/lib/food_ingredient_parser/strict/nodes.rb +1 -1
data/lib/food_ingredient_parser/version.rb +1 -1
metadata +2 -1

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: b8042180a4a4fbc5233a5630c7e0cf8e4751182b
-  data.tar.gz: 26abeaf528a49a6f01a47eb114e35631d9347cc7
+  metadata.gz: f08e6fcc9422b83503d37b41111f3bd540c11909
+  data.tar.gz: 16af14258ae67fa9b03b2b2196da9631d3dc6a9d
 SHA512:
-  metadata.gz: 6d7c7972846a88046760de7d1c5857f426891502fdca4f50c0fad179f2a7580dd0b157aed106e9efa62af013e4a20d6d8e3be9c49ec8e4eeee326cb228e26c91
-  data.tar.gz: 872aadc53b40e991e156fde3bb89db69bebb52ca8993aadfbc1f3c862b76b0d42ac01b377da70af22cf6b909b24b063a16d59f1aab4ed6ddd4917e1196736c50
+  metadata.gz: b1bfd6c713f0117cc8c13f4110624d270cb471096577e2dc47ea43f6f70664c88c9a2bec444bac407f9424b3c147245e16d04b5b0e9bf8442273c8d90c27955c
+  data.tar.gz: 873de15303ea9bebb4ab9d3504851a4b364e1aef849f6c530235476a4d8e65867fe5426dd7b9274ecd99befb2de83746b8854a6ba08a24cb8caac7525973340b

data/README.md CHANGED Viewed

@@ -29,7 +29,7 @@ Results in
 ```ruby
 {
   :contains=>[
-    {:name=>"Water", :amount=>"60%", :mark=>"*"},
+    {:name=>"Water", :amount=>"60%", :marks=>["*"]},
     {:name=>"suiker", :amount=>"30%"},
     {:name=>"voedingszuren", :contains=>[
       {:name=>"citroenzuur"}
@@ -165,8 +165,9 @@ Even though the strict parser would not give a result, the loose parser returns:
 {
   :contains=>[
     {:name=>"Saus", :contains=>[
-      {:name=>"tomaat", :mark=>"*", :amount=>"10%"},
-      {:contains=>[{:name=>"zout"}]},
+      {:name=>"tomaat", :marks=>["*"], :amount=>"10%", :contains=>[
+        {:name=>"zout"}
+      ]},
       {:name=>"peper"}
     ]}
   ]

data/lib/food_ingredient_parser/loose/node.rb CHANGED Viewed

@@ -28,7 +28,7 @@ module FoodIngredientParser::Loose
     def to_h
       r = {}
       r[:name] = name.text_value.strip if name && name.text_value.strip != ''
-      r[:mark] = mark.text_value.strip if mark
+      r[:marks] = [mark.text_value.strip] if mark
       r[:amount] = amount.text_value.strip if amount
       r[:contains] = contains.map(&:to_h).reject {|c| c == {} } if contains.any?
       r[:notes] = notes.map{|n| n.text_value.strip }.reject {|c| c == '' } if notes.any?

data/lib/food_ingredient_parser/loose/parser.rb CHANGED Viewed

@@ -1,6 +1,7 @@
 require_relative '../cleaner'
 require_relative 'scanner'
 require_relative 'transform/amount'
+require_relative 'transform/handle_missing_name'
 module FoodIngredientParser::Loose
   class Parser
@@ -13,11 +14,13 @@ module FoodIngredientParser::Loose
     # Parse food ingredient list text into a structured representation.
     #
     # @option clean [Boolean] pass +false+ to disable correcting frequently occurring issues
+    # @option normalize [Boolean] pass +false+ to disable some normalizations (handling missing names)
     # @return [FoodIngredientParser::Loose::Node] structured representation of food ingredients
-    def parse(s, clean: true, **options)
+    def parse(s, clean: true, normalize: true, **options)
       s = FoodIngredientParser::Cleaner.clean(s) if clean
       n = Scanner.new(s).scan
       n = Transform::Amount.transform!(n) if n
+      n = Transform::HandleMissingName.transform!(n) if n && normalize
       n
     end
   end

data/lib/food_ingredient_parser/loose/scanner.rb CHANGED Viewed

@@ -5,8 +5,15 @@ module FoodIngredientParser::Loose
     SEP_CHARS  = "|;,.".freeze
     MARK_CHARS = "¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº⁽⁾†‡•°#^*".freeze
-    PREFIX_RE  = /\A\s*(ingredients|contains|ingred[iï][eë]nt(en)?(declaratie)?|bevat|dit zit er\s?in|samenstelling|zutaten)\s*[:;.]\s*/i.freeze
+    PREFIX_RE  = /\A\s*(ingredients|contains|ingred[iï][eë]nt(en)?(declaratie)?|bevat|dit zit er\s?in|samenstelling|zutaten)\b\s*[:;.]?\s*/i.freeze
     NOTE_RE    = /\A\b(dit product kan\b|kan sporen\b.*?\bbevatten\b|voor allergenen\b|allergenen\b|E\s*=|gemaakt in\b|geproduceerd in\b|bevat mogelijk\b|kijk voor meer\b|allergie-info|in de fabriek\b|in dit bedrijf\b)/i.freeze
+    # Keep in sync with +abbrev+ in the +Common+ grammar, plus relevant ones from the +Amount+ grammar.
+    ABBREV_RE  = Regexp.union(%w[
+      a.o.p b.g.a b.o.b c.a c.i d.e d.m.v d.o.c d.o.p d.s e.a e.g e.u f.i.l f.o.s i.a
+      i.d i.e i.g.m.e i.g.p i.m.v i.o i.v.m l.s.l n.a n.b n.o n.v.t o.a o.b.v p.d.o
+      p.g.i q.s s.l s.s t.o.v u.h.t v.g v.s w.a w.o w.v vit denat N° °C
+      min max ca
+    ].map {|s| /\A#{Regexp.escape(s)}\b\.?/})
     def initialize(s, index: 0)
       @s = s                           # input string
@@ -45,7 +52,10 @@ module FoodIngredientParser::Loose
     end
     def scan_iteration_standard
-      if "([".include?(c)         # open nesting
+      if (len = abbrev_len) > 0   # defer iterations until after any abbreviation
+        cur # reference to record starting position
+        @i += len - 1
+      elsif "([".include?(c)      # open nesting
         open_parent
       elsif ")]".include?(c)      # close nesting
         add_child
@@ -57,8 +67,12 @@ module FoodIngredientParser::Loose
       elsif is_sep?               # separator
         add_child
       elsif ":".include?(c)       # another open nesting
-        open_parent(auto_close: true)
-        @iterator = :colon
+        if @s[@i+1..-1] =~ /\A\s*(\(|\[)/
+          # ignore if before an open bracket, then it's a regular nesting
+        else
+          open_parent(auto_close: true)
+          @iterator = :colon
+        end
       elsif is_mark? && !cur.mark # mark after ingredient
         name_until_here
         len = mark_len
@@ -70,7 +84,10 @@ module FoodIngredientParser::Loose
     end
     def scan_iteration_colon
-      if "/".include?(c)        # slash separator in colon nesting only
+      if (len = abbrev_len) > 0 # defer iterations until after any abbreviation
+        cur # reference to record starting position
+        @i += len - 1
+      elsif "/".include?(c)     # slash separator in colon nesting only
         add_child
       elsif is_sep?             # regular separator indicates end of colon nesting
         add_child
@@ -108,14 +125,14 @@ module FoodIngredientParser::Loose
       @cur ||= Node.new(@s, @i)
     end
-    def is_mark?
-      mark_len > 0 && @s[@i..@i+1] !~ /\A°[CF]/
-    end
     def is_sep?(chars: SEP_CHARS)
       chars.include?(c) && @s[@i-1..@i+1] !~ /\A\d.\d\z/
     end
+    def is_mark?
+      mark_len > 0 && @s[@i..@i+1] !~ /\A°[CF]/
+    end
     def mark_len
       i = @i
       while @s[i] && MARK_CHARS.include?(@s[i])
@@ -124,6 +141,11 @@ module FoodIngredientParser::Loose
       i - @i
     end
+    def abbrev_len
+      m = @s[@i .. -1].match(ABBREV_RE)
+      m ? m.offset(0).last : 0
+    end
     def is_notes_start?
       # @todo use more heuristics: don't assume dot is notes when separator is a dot, and only toplevel?
       if ( is_mark? && @s[@i+mark_len..-1] =~ /\A\s*=/ ) ||     # "* = Biologisch"

data/lib/food_ingredient_parser/loose/transform/handle_missing_name.rb ADDED Viewed

@@ -0,0 +1,64 @@
+module FoodIngredientParser::Loose
+  module Transform
+    # Transforms node tree to handle missing names.
+    #
+    # The loose parser can return a node tree that has some ingredients without a name.
+    # Usually this means that either the parser wasn't smart enough to understand the input,
+    # or the input was not strictly clear (e.g. a case like "herbs, (oregano), salt" is often seen).
+    #
+    # When a contained node is found which doesn't have a name:
+    # * For the amount (if any): ignore it (as it's often ambiguous which ingredient it belongs to)
+    # * For the marks (if any): ignore them (we might instead add them to the contained ingredients)
+    # * For the contained ingredients (if any):
+    #   - if the previous ingredient is present and doesn't contain ingredients already,
+    #     assume the current contained ingredients are actually part of the previous ingredient.
+    #   - if there is no previous ingredient, assume the nesting is wrong and insert them before
+    #     the other ingredients one depth level above.
+    #   - if there is a previous ingredient which already contains ingredients, we can't make much of it;
+    #     to avoid losing them, append them to the previous ingredient's contained ingredients.
+    #
+    class HandleMissingName
+      def self.transform!(node)
+        new(node).transform!
+      end
+      def initialize(node)
+        @node = node
+      end
+      def transform!
+        transform_children!(@node)
+        @node
+      end
+      private
+      def transform_children!(node)
+        prev = nil
+        new_contains = []
+        node.contains.each do |child|
+          # Apply recursively. Do it before processing to handle multiple depth levels of missing names.
+          transform_children!(child) if child.contains.any?
+          if child.name.nil? || child.name.text_value.strip == ''
+            # Name is empty, we need to do something.
+            if prev
+              # there is a previous ingredient: move children to new parent
+              prev.contains.push(*child.contains)
+            else
+              # there is no previous ingredient: move children one level up
+              new_contains.push(*child.contains)
+            end
+          else
+            # Nothing to see here, just leave it as it is.
+            new_contains << child
+          end
+          prev = child
+        end
+        node.contains = new_contains
+      end
+    end
+  end
+end

data/lib/food_ingredient_parser/strict/grammar/amount.treetop CHANGED Viewed

@@ -3,29 +3,35 @@ module FoodIngredientParser::Strict::Grammar
     include Common
     rule amount
-      '(' ws* amount:simple_amount ws* ')' <AmountNode> /
-      '[' ws* amount:simple_amount ws* ']' <AmountNode> /
-      '{' ws* amount:simple_amount ws* '}' <AmountNode> /
-      amount:simple_amount                 <AmountNode>
+      '(' ws* amount:amount_simple ws* ')' <AmountNode> /
+      '[' ws* amount:amount_simple ws* ']' <AmountNode> /
+      '{' ws* amount:amount_simple ws* '}' <AmountNode> /
+      amount:amount_simple                 <AmountNode>
     end
-    rule simple_amount
+    rule amount_simple
       ( (
         'of which'i / 'at least'i / 'minimal'i / 'maximal'i / 'less than'i / 'more than'i /
         'waarvan'i / 'ten minste'i / 'tenminste'i / 'minimaal'i / 'maximaal'i / 'minder dan'i / 'meer dan'i /
-        'min.'i / 'min'i / 'max.'i / 'max'i
+        'min.'i / 'min'i / 'max.'i / 'max'i / 'c.a.'i / 'ca.'i / 'ca'i
       ) ws* )?
-      [±∓~∼∽≂≃≈≲≤<>≥≳]? ws*
-      simple_amount_quantity
+      amount_simple_quantity
       ( ws+ (
         'minimaal'i / 'minimum'i / 'van het uitlekgewicht'i / 'van het geheel'i /
         'min.'i / 'min'i / 'max.'i / 'max'i
       ) )?
     end
-    rule simple_amount_quantity
-      number ( ws* '-' ws* number )? ws* ( [%٪⁒％﹪] / ( ( 'procent' / 'percent' / 'gram'i / 'ml'i / 'mg'i / 'g'i ) !char ) )
+    rule amount_simple_quantity
+      amount_simple_number ( ws* amount_simple_unit? ws* dash ws* amount_simple_number )? ws* amount_simple_unit
+    end
+    rule amount_simple_number
+      ( [±∓~∼∽≂≃≈≲≤<>≥≳] ws* )? number
     end
+    rule amount_simple_unit
+      ( percent / ( ( 'procent' / 'percent' / 'gram'i / 'ml'i / 'mg'i / 'gr'i / 'g'i ) !char ) )
+    end
   end
 end

data/lib/food_ingredient_parser/strict/grammar/common.treetop CHANGED Viewed

@@ -31,10 +31,18 @@ module FoodIngredientParser::Strict::Grammar
       [½⅓⅔¼¾⅕⅖⅗⅘⅙⅚⅐⅛⅜⅝⅞⅑⅒]
     end
+    rule percent
+      [%٪⁒％﹪]
+    end
     rule number
       digit+ [,.] digit+ / digit+ ws* fraction / fraction / digit+
     end
+    rule dash
+      [-֊ ‐ ‑ ‒ – — ― ﹘﹣－]
+    end
     rule word
       abbrev / char+
     end
@@ -50,6 +58,9 @@ module FoodIngredientParser::Strict::Grammar
       #   cat data/ingredient-samples-nl | perl -ne '$_=lc($_); /\b(([a-z]\.)+[a-z])\W/ && print "$1\n"' | sort | uniq -c | sort -rn
       # Finally, you can generate the full list using this command:
       #   cat data/ingredient-samples-nl | perl -ne '$_=lc($_); /\b(([a-z]\.)+[a-z])\W/ && print "$1\n"' | sort | uniq | sed "s/^/'/;s/$/'i \//"
+      #
+      # Keep this list in sync with {FoodIngredientParser::Loose::Scanner::ABBREV_RE}.
+      # too bad we can't use a shared array for this - https://groups.google.com/d/msg/treetop-dev/f3NveVHi7Aw/0uUogmLMb8wJ
       (
         'a.o.p'i /
         'b.g.a'i /

data/lib/food_ingredient_parser/strict/grammar/ingredient_nested.treetop CHANGED Viewed

@@ -5,14 +5,14 @@ module FoodIngredientParser::Strict::Grammar
     include IngredientSimple
     rule ingredient_nested
-      ( ing:ingredient_simple              ws* '(' contains:ingredient_nested_in ws* ')' ws? mark:mark ws* amount:amount <NestedIngredientNode> ) /
-      ( ing:ingredient_simple              ws* '(' contains:ingredient_nested_in ws* ')'               ws* amount:amount <NestedIngredientNode> ) /
-      ( ing:ingredient_simple_with_amount  ws* '(' contains:ingredient_nested_in ws* ')' ws? mark:mark                   <NestedIngredientNode> ) /
-      ( ing:ingredient_simple_with_amount  ws* '(' contains:ingredient_nested_in ws* ')'                                 <NestedIngredientNode> ) /
-      ( ing:ingredient_simple              ws* '[' contains:ingredient_nested_in ws* ']' ws? mark:mark ws* amount:amount <NestedIngredientNode> ) /
-      ( ing:ingredient_simple              ws* '[' contains:ingredient_nested_in ws* ']'               ws* amount:amount <NestedIngredientNode> ) /
-      ( ing:ingredient_simple_with_amount  ws* '[' contains:ingredient_nested_in ws* ']' ws? mark:mark                   <NestedIngredientNode> ) /
-      ( ing:ingredient_simple_with_amount  ws* '[' contains:ingredient_nested_in ws* ']'                                 <NestedIngredientNode> )
+      ( ing:ingredient_simple              (ws* ':')? ws* '(' contains:ingredient_nested_in ws* ')' ws? mark:mark ws* amount:amount <NestedIngredientNode> ) /
+      ( ing:ingredient_simple              (ws* ':')? ws* '(' contains:ingredient_nested_in ws* ')'               ws* amount:amount <NestedIngredientNode> ) /
+      ( ing:ingredient_simple_with_amount  (ws* ':')? ws* '(' contains:ingredient_nested_in ws* ')' ws? mark:mark                   <NestedIngredientNode> ) /
+      ( ing:ingredient_simple_with_amount  (ws* ':')? ws* '(' contains:ingredient_nested_in ws* ')'                                 <NestedIngredientNode> ) /
+      ( ing:ingredient_simple              (ws* ':')? ws* '[' contains:ingredient_nested_in ws* ']' ws? mark:mark ws* amount:amount <NestedIngredientNode> ) /
+      ( ing:ingredient_simple              (ws* ':')? ws* '[' contains:ingredient_nested_in ws* ']'               ws* amount:amount <NestedIngredientNode> ) /
+      ( ing:ingredient_simple_with_amount  (ws* ':')? ws* '[' contains:ingredient_nested_in ws* ']' ws? mark:mark                   <NestedIngredientNode> ) /
+      ( ing:ingredient_simple_with_amount  (ws* ':')? ws* '[' contains:ingredient_nested_in ws* ']'                                 <NestedIngredientNode> )
     end
     rule ingredient_nested_in

data/lib/food_ingredient_parser/strict/grammar/list_coloned.treetop CHANGED Viewed

@@ -5,9 +5,11 @@ module FoodIngredientParser::Strict::Grammar
     include Ingredient
     rule list_coloned
-      contains:( ( ws* list_coloned_ingredient ws* '.' )+ list_coloned_ingredient? ) <ListNode> /
-      contains:( ( ws* list_coloned_ingredient ws* ';' )+ list_coloned_ingredient? ) <ListNode> /
-      contains:(   ws* list_coloned_ingredient )                                     <ListNode>
+      contains:( ( ws* list_coloned_ingredient ws* '.' )+ ws* list_coloned_ingredient ) <ListNode> /
+      contains:( ( ws* list_coloned_ingredient ws* '.' )+                             ) <ListNode> /
+      contains:( ( ws* list_coloned_ingredient ws* ';' )+ ws* list_coloned_ingredient ) <ListNode> /
+      contains:( ( ws* list_coloned_ingredient ws* ';' )+                             ) <ListNode> /
+      contains:(   ws* list_coloned_ingredient )                                        <ListNode>
     end
     rule list_coloned_inner_list

data/lib/food_ingredient_parser/strict/nodes.rb CHANGED Viewed

@@ -47,7 +47,7 @@ module FoodIngredientParser::Strict
         h[:name] = name.text_value if respond_to?(:name)
         h[:name] = pre.text_value + h[:name] if respond_to?(:pre)
         h[:name] = h[:name] + post.text_value if respond_to?(:post)
-        h[:mark] = mark.text_value if respond_to?(:mark) && mark.text_value != ''
+        h[:marks] = [mark.text_value] if respond_to?(:mark) && mark.text_value != ''
         h
       end
     end

data/lib/food_ingredient_parser/version.rb CHANGED Viewed

@@ -1,4 +1,4 @@
 module FoodIngredientParser
-  VERSION      = '1.0.0.pre.8'
+  VERSION      = '1.0.0.pre.9'
   VERSION_DATE = '2018-09-19'
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: food_ingredient_parser
 version: !ruby/object:Gem::Version
-  version: 1.0.0.pre.8
+  version: 1.0.0.pre.9
 platform: ruby
 authors:
 - wvengen
@@ -48,6 +48,7 @@ files:
 - lib/food_ingredient_parser/loose/scanner.rb
 - lib/food_ingredient_parser/loose/transform/amount.rb
 - lib/food_ingredient_parser/loose/transform/amount_from_name.treetop
+- lib/food_ingredient_parser/loose/transform/handle_missing_name.rb
 - lib/food_ingredient_parser/strict/grammar.rb
 - lib/food_ingredient_parser/strict/grammar/amount.treetop
 - lib/food_ingredient_parser/strict/grammar/common.treetop