RubyGems - food_ingredient_parser - Versions diffs - 1.1.10 → 1.3.0 - Mend

food_ingredient_parser 1.1.10 → 1.3.0

Files changed (14) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: a56d22b7e67a3a913b051bcbda8da885ddd467dc53f5a0df0faa5b40759a1f35
-  data.tar.gz: 427dd79c9f9203dc7901ead6264e08c05183d02aec266ac1d3bff930a5ba1dcd
+  metadata.gz: deb4cb55b3d5c41f02171e031fd11cc996cf2e8df9f074aa163efdff58baa6b0
+  data.tar.gz: 63dc1b52a15e6f70114cca9ed5d8a585a1f475d70131e4852310cf8755558dca
 SHA512:
-  metadata.gz: 0b07032ade3a55ce208bcb0c069223b41aee21f185a2b6a9bb91332881dfef8e1d829ae966097e48ffdba9984517be43b10bd027099f9bdce04e3a4c6fc41ca8
-  data.tar.gz: ebdf452a09d54b151ce8cfa9bb65b4477dd1afc81bfc5cd1d94055f726d387f522dd04a3e11b73d4b26222a18bc1068912a81c8e2e3cd8439b0cee1c1ec290d7
+  metadata.gz: 1cebae488578f1e00f8d905f34d39cef653cdcb4922d26878687afb3463ae3c24ca6592a6f19ac482b5a2f08e95feef342c9631da69acb99feea4e1a81269057
+  data.tar.gz: a7c8b98a5c3fd3aee8962e8f31cd9a0ede791e8d7c7193bfbb1ba2524be057bd7b38499973cd6ab6fea58020161ba6f8615dc5eef2191a87d45afec4662fa264

data/README.md CHANGED Viewed

@@ -104,7 +104,7 @@ RootNode+Root3 offset=0, "tomato" (contains,notes):
   SyntaxNode offset=6, ""
 {:contains=>[{:name=>"tomato"}]}
-$ bin/food_ingredient_parser --html -s "tomato"
+$ food_ingredient_parser --html -s "tomato"
 <div class="root"><span class='depth0'><span class='name'>tomato</span></span></div>
 $ food_ingredient_parser -v -r loose -s "tomato"
@@ -197,7 +197,7 @@ plus a bit of English and German. Support for other languages is already good, b
 areas: improvements are welcome (starting with a corpus in [data/](data/)).
 Many ingredient lists from the USA are structured a bit differently than those from Europe, they
-parse less well (that that's a matter of tine-tuning).
+parse less well (that is probably a matter of fine-tuning).
 ## Test data

data/lib/food_ingredient_parser/loose/node.rb CHANGED Viewed

@@ -5,7 +5,7 @@ module FoodIngredientParser::Loose
   class Node
     include ToHtml
-    attr_accessor :name, :mark, :amount, :contains, :notes
+    attr_accessor :name_parts, :mark, :amount, :contains, :notes
     attr_reader :input, :interval, :auto_close
     def initialize(input, interval, auto_close: false)
@@ -14,7 +14,8 @@ module FoodIngredientParser::Loose
       @auto_close = auto_close
       @contains = []
       @notes = []
-      @name = @mark = @amount = nil
+      @name_parts = []
+      @mark = @amount = nil
     end
     def ends(index)
@@ -31,7 +32,8 @@ module FoodIngredientParser::Loose
     def to_h
       r = {}
-      r[:name] = name.text_value.strip if name && name.text_value.strip != ''
+      _name = name
+      r[:name] = _name if _name
       r[:marks] = [mark.text_value.strip] if mark
       r[:amount] = amount.text_value.strip if amount
       r[:contains] = contains.map(&:to_h).reject {|c| c == {} } if contains.any?
@@ -39,6 +41,11 @@ module FoodIngredientParser::Loose
       r
     end
+    def name
+      strings = name_parts.map {|n| n.text_value.strip }.reject {|n| n == nil || n == '' }
+      return strings.any? ? strings.join(" ") : nil
+    end
     def inspect(indent="", variant="")
       inspect_self(indent, variant) +
       inspect_children(indent)
@@ -47,7 +54,7 @@ module FoodIngredientParser::Loose
     def inspect_self(indent="", variant="")
       [
         indent + "Node#{variant} interval=#{@interval}",
-        name ? "name=#{name.text_value.strip.inspect}" : nil,
+        name ? "name=#{name.inspect}" : nil,
         mark ? "mark=#{mark.text_value.strip.inspect}" : nil,
         amount ? "amount=#{amount.text_value.strip.inspect}" : nil,
         auto_close ? "auto_close" : nil

data/lib/food_ingredient_parser/loose/scanner.rb CHANGED Viewed

@@ -4,8 +4,9 @@ module FoodIngredientParser::Loose
   class Scanner
     SEP_CHARS  = "|;,.".freeze
+    AND_SEP_RE = /\A\s*(and|en|und)\s+/i.freeze
     MARK_CHARS = "¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº⁽⁾†‡⁺•°▪◊#^˄*~".freeze
-    PREFIX_RE  = /\A\s*(ingredients|contains|ingred[iï][eë]nt(en)?(declaratie)?|bevat|dit zit er\s?in|samenstelling|zutaten)\b\s*[:;.]?\s*/i.freeze
+    PREFIX_RE  = /\A\s*(ingredients(\s*list)?|contains|ingred[iï][eë]nt(en)?(declaratie)?|bevat|dit zit er\s?in|samenstelling|zutaten)\b\s*[:;.]?\s*/i.freeze
     NOTE_RE    = /\A\b(dit product kan\b|deze verpakking kan\b|kan sporen\b.*?\bbevatten\b|voor allergenen\b|allergenen\b|allergie[- ]informatie(\s*:|\b)|E\s*=|gemaakt in\b|geproduceerd in\b|bevat mogelijk\b|kijk voor meer\b|allergie-info|in de fabriek\b|in dit bedrijf\b|voor [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bis [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bgebruikt\b)/i.freeze
     # Keep in sync with +abbrev+ in the +Common+ grammar, plus relevant ones from the +Amount+ grammar.
     ABBREV_RE  = Regexp.union(
@@ -23,8 +24,8 @@ module FoodIngredientParser::Loose
         www\.[-_\/:%.A-Za-z0-9]+
       )/xi,
       *%w[
-        a.o.p b.g.a b.o.b c.a c.i d.e d.m.v d.o.c d.o.p d.s e.a e.g e.u f.i.l f.o.s i.a
-        i.d i.e i.g.m.e i.g.p i.m.v i.o i.v.m l.s.l n.a n.b n.o n.v.t o.a o.b.v p.d.o
+        a.o.p b.g.a b.o.b c.a c.i d.e d.m.v d.o.c d.o.p d.s e.a e.g e.u f.i.l f.o.s h.o.h
+        i.a i.d i.e i.g.m.e i.g.p i.m.v i.o i.v.m l.s.l n.a n.b n.o n.v.t o.a o.b.v p.d.o
         p.g.i q.s s.l s.s t.o.v u.h.t v.g v.s w.a w.o w.v vit denat alc vol conc subsp
         min max ca
       ].map {|s| /\A#{Regexp.escape(s)}\b\.?/i}
@@ -32,8 +33,9 @@ module FoodIngredientParser::Loose
     def initialize(s, index: 0)
       @s = s                           # input string
-      @i = index                       # current index in string
+      @i = index                       # current index in string, the iterator looks at this character
       @cur = nil                       # current node we're populating
+      @curifree = nil                  # last index in string for current node that we haven't added to a child node yet
       @ancestors = [Node.new(@s, @i)]  # nesting hierarchy
       @iterator = :beginning           # scan_iteration_<iterator> to use for parsing
       @dest = :contains                # append current node to this attribute on parent
@@ -75,6 +77,12 @@ module FoodIngredientParser::Loose
       elsif ")]".include?(c)      # close nesting
         add_child
         close_parent
+        # after bracket check for 'and' to not lose text
+        if is_and_sep?(@i+1)
+          @i += and_sep_len(@i+1)
+          @curifree = @i # don't include 'and' in cur name
+          add_child
+        end
       elsif is_notes_start?       # usually a dot marks the start of notes
         close_all_ancestors
         @iterator = :notes
@@ -141,13 +149,26 @@ module FoodIngredientParser::Loose
     end
     def cur
-      @cur ||= Node.new(@s, @i)
+      if !@cur
+        @cur ||= Node.new(@s, @i)
+        @curifree = @i
+      end
+      @cur
     end
     def is_sep?(chars: SEP_CHARS)
       chars.include?(c) && @s[@i-1..@i+1] !~ /\A\d.\d\z/
     end
+    def is_and_sep?(i = @i)
+      and_sep_len(i) > 0
+    end
+    def and_sep_len(i = @i)
+      m = @s[i..-1].match(AND_SEP_RE)
+      m ? m.offset(0).last : 0
+    end
     def is_mark?(i = @i)
       mark_len(i) > 0 && @s[i..i+1] !~ /\A°[CF]/
     end
@@ -186,16 +207,19 @@ module FoodIngredientParser::Loose
       cur.ends(@i-1)
       parent.send(@dest) << cur
       @cur = nil
+      @curifree = nil
     end
     def open_parent(**options)
       name_until_here
       @ancestors << cur
       @cur = Node.new(@s, @i + 1, **options)
+      @curifree = @i + 1
     end
     def close_parent
       return unless @ancestors.count > 1
+      @curifree = @i + 1
       @cur = @ancestors.pop
       while @cur.auto_close
         add_child
@@ -212,15 +236,15 @@ module FoodIngredientParser::Loose
     end
     def name_until_here
-      cur.name ||= begin
-        i, j = cur.interval.first, @i - 1
-        i += mark_len(i) # skip any mark in front
-        # Set name if there is any. There is one corner-case that needs to be avoided when
-        # a nesting was opened without a name, which would set the name to the nesting text.
-        # In this case, the name starts with an open-nesting symbol, which should never happen.
-        if j >= i && !"([:".include?(@s[i])
-          Node.new(@s, i .. j)
-        end
+      return unless @curifree # no cur started yet
+      i, j = @curifree, @i - 1
+      i += mark_len(i) # skip any mark in front
+      # Set name if there is any. There is one corner-case that needs to be avoided when
+      # a nesting was opened without a name, which would set the name to the nesting text.
+      # In this case, the name starts with an open-nesting symbol, which should never happen.
+      if j >= i && !"([:".include?(@s[i])
+        cur.name_parts << Node.new(@s, i .. j)
+        @curifree = @i
       end
     end

data/lib/food_ingredient_parser/loose/transform/amount.rb CHANGED Viewed

@@ -29,18 +29,26 @@ module FoodIngredientParser::Loose
       # Extract amount from name, if any.
       def transform_name(node = @node)
-        if !node.amount && parsed = parse_amount(node.name&.text_value)
-          offset = node.name.interval.first
+        if !node.amount
+          node.name_parts.each_with_index do |name, i|
+            parsed = parse_amount(name.text_value)
+            next unless parsed
+            offset = name.interval.first
-          amount = parsed.amount.amount
-          node.amount = Node.new(node.input, offset + amount.interval.first .. offset + amount.interval.last - 1)
+            amount = parsed.amount.amount
+            node.amount = Node.new(node.input, offset + amount.interval.first .. offset + amount.interval.last - 1)
-          name = parsed.respond_to?(:name) && parsed.name
-          if name && name.interval.count > 0
-            node.name = Node.new(node.input, offset + name.interval.first .. offset + name.interval.last - 1)
-          else
-            node.name = nil
+            name = parsed.respond_to?(:name) && parsed.name
+            node.name_parts[i] = if name && name.interval.count > 0
+              Node.new(node.input, offset + name.interval.first .. offset + name.interval.last - 1)
+            else
+              nil
+            end
+            # found an amount, stop looking in other parts
+            break
           end
+          # remove cleared name parts
+          node.name_parts.reject!(&:nil?)
         end
         # recursively transform contained nodes

data/lib/food_ingredient_parser/loose/transform/handle_missing_name.rb CHANGED Viewed

@@ -42,7 +42,8 @@ module FoodIngredientParser::Loose
           # Apply recursively. Do it before processing to handle multiple depth levels of missing names.
           transform_children!(child) if child.contains.any?
-          if child.name.nil? || child.name.text_value.strip == ''
+          name = child.name
+          if name.nil? || name == ''
             # Name is empty, we need to do something.
             if prev
               # there is a previous ingredient: move children to new parent

data/lib/food_ingredient_parser/loose/transform/split_e_numbers.rb CHANGED Viewed

@@ -29,21 +29,27 @@ module FoodIngredientParser::Loose
       def transform_node!(node)
         if node.contains.any?
           node.contains.each {|n| transform_node!(n) }
-        elsif node.name && m = MATCH_RE.match(node.name.text_value)
-          i = 0
-          while m = node.name.text_value.match(SPLIT_RE, i)
-            node.contains << new_node(node, i, m.begin(0)-1)
-            i = m.end(0)
+        else
+          node.name_parts.each_with_index do |name, name_index|
+            if m = MATCH_RE.match(name.text_value)
+              i = 0
+              while m = name.text_value.match(SPLIT_RE, i)
+                node.contains << new_node(name, i, m.begin(0)-1)
+                i = m.end(0)
+              end
+              node.contains << new_node(name, i, name.interval.last) if i <= name.interval.last
+              node.name_parts[name_index] = nil
+            end
           end
-          node.contains << new_node(node, i, node.name.interval.last) if i <= node.name.interval.last
-          node.name = nil
+          # remove cleared name parts
+          node.name_parts.reject!(&:nil?)
         end
       end
-      def new_node(node, begins, ends)
-        offset = node.name.interval.first
-        new_node = Node.new(node.input, offset + begins .. offset + ends)
-        new_node.name = Node.new(node.input, new_node.interval)
+      def new_node(name, begins, ends)
+        offset = name.interval.first
+        new_node = Node.new(name.input, offset + begins .. offset + ends)
+        new_node.name_parts = [Node.new(name.input, new_node.interval)]
         new_node
       end
     end

data/lib/food_ingredient_parser/strict/grammar/common.treetop CHANGED Viewed

@@ -102,6 +102,7 @@ module FoodIngredientParser::Strict::Grammar
         'e.u'i /
         'f.i.l'i /
         'f.o.s'i /
+        'h.o.h'i /
         'i.a'i /
         'i.d'i /
         'i.e'i /

data/lib/food_ingredient_parser/strict/grammar/ingredient.treetop CHANGED Viewed

@@ -5,7 +5,12 @@ module FoodIngredientParser::Strict::Grammar
     include IngredientColoned
     rule ingredient
-      ws* ( ingredient_nested / ingredient_coloned / ingredient_simple_with_amount )
+      ws*
+      (
+        ingredient_nested ( ws* and ws+ ingredient )? /
+        ingredient_coloned /
+        ingredient_simple_with_amount
+      )
     end
   end

data/lib/food_ingredient_parser/strict/grammar/list.treetop CHANGED Viewed

@@ -4,11 +4,13 @@ module FoodIngredientParser::Strict::Grammar
     include Ingredient
     rule list
-      contains:(ingredient ( ws* '|' ws* ingredient )+ ( ws+ and ws+ ingredient )? ) <ListNode> /
-      contains:(ingredient ( ws* ';' ws* ingredient )+ ( ws+ and ws+ ingredient )? ) <ListNode> /
-      contains:(ingredient ( ws* ',' ws* ingredient )+ ( ws+ and ws+ ingredient )? ) <ListNode> /
-      contains:(ingredient ( ws* '.' ws* ingredient )+ ( ws+ and ws+ ingredient )? ) <ListNode> /
-      contains:(ingredient ( ws+ and ws+ ingredient )? )                             <ListNode>
+      contains:(ingredient ( ws* '|' ws* ingredient )+ ( ws+ and ws+ ingredient )? )      <ListNode> /
+      contains:(ingredient ( ws* ';' ws* ingredient )+ ( ws+ and ws+ ingredient )? )      <ListNode> /
+      contains:(ingredient ( ws* ',' ws* ingredient )+ ( ws+ and ws+ ingredient )? )      <ListNode> /
+      contains:(ingredient ( ws* '.' ws* ingredient )+ ( ws+ and ws+ ingredient )? )      <ListNode> /
+      contains:(ingredient_simple_e_number ( ws* '/'  ws* ingredient_simple_e_number )+ ) <ListNode> /
+      contains:(ingredient_simple_e_number ( ws* dash ws* ingredient_simple_e_number )+ ) <ListNode> /
+      contains:(ingredient ( ws+ and ws+ ingredient )? )                                  <ListNode>
     end
   end
 end

data/lib/food_ingredient_parser/strict/grammar/list_coloned.treetop CHANGED Viewed

@@ -17,6 +17,7 @@ module FoodIngredientParser::Strict::Grammar
     end
     rule list_coloned_inner_list
+      contains:( ingredient_simple_e_number ( ws* '/'  ws* ingredient_simple_e_number )+ ) <ListNode> /
       contains:( ingredient_simple_e_number ( ws* dash ws* ingredient_simple_e_number )+ ) <ListNode> /
       contains:( ingredient ( ws* ',' ws* ingredient )* ) <ListNode>
     end

data/lib/food_ingredient_parser/strict/grammar/root.treetop CHANGED Viewed

@@ -19,9 +19,10 @@ module FoodIngredientParser::Strict::Grammar
     rule root_prefix
       (
-        'ingredients'i / 'contains'i /
+        'ingredients'i ( ws+ 'list'i )? / 'contains'i /
         ('ingred'i [IÏiï] [EËeë] 'n'i ( 't'i 'en'i? 'declaratie'i? )? ) / 'bevat'i / 'dit zit er in'i / 'samenstelling'i /
-        'zutaten'i
+        'zutaten'i /
+        'ingredienser'i
       )
       ( ws* [:;.] ( ws* newline )? / ws* newline / ws ) ws*  # optional colon or other separator
       "'"? ws*                                               # stray quote occurs sometimes

data/lib/food_ingredient_parser/version.rb CHANGED Viewed

@@ -1,4 +1,4 @@
 module FoodIngredientParser
-  VERSION      = '1.1.10'
-  VERSION_DATE = '2021-03-23'
+  VERSION      = '1.3.0'
+  VERSION_DATE = '2024-06-13'
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: food_ingredient_parser
 version: !ruby/object:Gem::Version
-  version: 1.1.10
+  version: 1.3.0
 platform: ruby
 authors:
 - wvengen
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2021-03-23 00:00:00.000000000 Z
+date: 2024-06-13 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: treetop
@@ -87,7 +87,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.0.3
+rubygems_version: 3.1.6
 signing_key:
 specification_version: 4
 summary: Parser for ingredient lists found on food products.