food_ingredient_parser 1.1.9 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1e83f2b90f83d4cb0deb193140dcce6f4ee1700a80c781c95b7fa09219da9f69
4
- data.tar.gz: '0929b958e93cf8e61e54b5e268ad32135aa45e87d1c163ae912790de55a85e28'
3
+ metadata.gz: 7c478a080e36c8f48ee3dbd6e9978eadec3758a4b0ab6fab571e18f103ed6bf0
4
+ data.tar.gz: aa078366f72ab03d038d497c908a3ad92f5816f37d3f0308fa64e81680905dea
5
5
  SHA512:
6
- metadata.gz: 929fd7057be9e1f35ec1db55e08bad1078d8bc1f977b07b25c26f3f0ce0919e0cb82ca4379e050067b20ab5cee760fb10c2d267d60fc7555d18af1e8ae9ca9e9
7
- data.tar.gz: 4873942e33ad823fbf84391cc70c4f4760cf0d93cceced307814d3abf73f5e91af8529fcfa048345b0a896dd967deac729d9051ab7e89b3c4174c8ee42f9f61b
6
+ metadata.gz: d8acbd71e431958a72350e6fd1d3e5e8d21db8ee53525c53a08bbe2c564734fca9601ac0fdc33d9737695f292bc7cd6da898721f02f68ca8f87175c5b276c709
7
+ data.tar.gz: f261a1537a6e903d55b36dc91c0a1c302893d7a092a83afb8598730c87142d041fdaf86540918697f297f5024461082e5e56594df64f603f56f4b22148c7c9fd
data/README.md CHANGED
@@ -69,6 +69,7 @@ Usage: bin/food_ingredient_parser [options] --file|-f <filename>
69
69
  -r, --parser PARSER Use specific parser (strict, loose).
70
70
  -e, --[no-]escape Escape newlines
71
71
  -c, --[no-]color Use color
72
+ --[no-]html Print as HTML with parsing markup
72
73
  -v, --[no-]verbose Show more data (parsed tree).
73
74
  --version Show program version.
74
75
  -h, --help Show this help
@@ -103,6 +104,9 @@ RootNode+Root3 offset=0, "tomato" (contains,notes):
103
104
  SyntaxNode offset=6, ""
104
105
  {:contains=>[{:name=>"tomato"}]}
105
106
 
107
+ $ food_ingredient_parser --html -s "tomato"
108
+ <div class="root"><span class='depth0'><span class='name'>tomato</span></span></div>
109
+
106
110
  $ food_ingredient_parser -v -r loose -s "tomato"
107
111
  "tomato"
108
112
  Node interval=0..5
@@ -193,7 +197,7 @@ plus a bit of English and German. Support for other languages is already good, b
193
197
  areas: improvements are welcome (starting with a corpus in [data/](data/)).
194
198
 
195
199
  Many ingredient lists from the USA are structured a bit differently than those from Europe, they
196
- parse less well (that that's a matter of tine-tuning).
200
+ parse less well (that is probably a matter of fine-tuning).
197
201
 
198
202
  ## Test data
199
203
 
@@ -2,6 +2,7 @@
2
2
  #
3
3
  # Parser for food ingredient lists.
4
4
  #
5
+ require 'cgi'
5
6
  require 'optparse'
6
7
 
7
8
  $:.push(File.expand_path(File.dirname(__FILE__) + "/../lib"))
@@ -31,24 +32,31 @@ def colorize(color, s)
31
32
  end
32
33
  end
33
34
 
34
- def parse_single(s, parsed=nil, parser:, verbosity: 1, print: nil, escape: false, color: false)
35
# Parse a single ingredient string and print the outcome to standard output.
#
# @param s [String] ingredient text to parse
# @param parsed [Object, nil] an already obtained parse result for +s+; when
#   nil (or false), +parser.parse(s)+ is called to obtain one
# @param parser [#parse] parser to use; in text mode its +parser.failure_reason+
#   is shown when nothing could be parsed
# @param verbosity [Integer] nothing is printed at 0; above 1 the text output
#   also includes +parsed.inspect+
# @param print [Symbol, nil] +:parsed+ or +:noresult+ to show only those
#   outcomes, nil to show everything
# @param escape [Boolean] show newlines of the echoed input as a literal \n
#   (text mode only)
# @param color [Boolean] colorize the text output
# @param html [Boolean] print HTML markup instead of text
# @return [Boolean, nil] whether a parse result is available, or nil when the
#   outcome was filtered out by +print+
def parse_single(s, parsed=nil, parser:, verbosity: 1, print: nil, escape: false, color: false, html: false)
  parsed ||= parser.parse(s)

  return unless print.nil? || (parsed && print == :parsed) || (!parsed && print == :noresult)

  if html
    if verbosity > 0
      # Without a parse result there is no markup to show, so fall back to the
      # escaped input text. +parsed+ is nil here and must not be escaped itself.
      markup = parsed ? parsed.to_html : CGI.escapeHTML(s)
      puts('<div class="root">' + markup + '</div>')
    end
  else
    puts colorize(color && "0;32", escape ? s.gsub("\n", "\\n") : s) if verbosity > 0

    if parsed
      puts(parsed.inspect) if verbosity > 1
      pp(parsed.to_h, color: color) if verbosity > 0
    else
      puts "(no result: #{parser.parser.failure_reason})" if verbosity > 0
    end
  end

  !!parsed
end
50
58
 
51
- def parse_file(path, parser:, verbosity: 1, print: nil, escape: false, color: false)
59
+ def parse_file(path, parser:, verbosity: 1, print: nil, escape: false, color: false, html: false)
52
60
  count_parsed = count_noresult = 0
53
61
  File.foreach(path) do |line|
54
62
  next if line =~ /^#/ # comment
@@ -59,7 +67,7 @@ def parse_file(path, parser:, verbosity: 1, print: nil, escape: false, color: fa
59
67
  count_parsed += 1 if parsed
60
68
  count_noresult += 1 unless parsed
61
69
 
62
- parse_single(line, parsed, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color)
70
+ parse_single(line, parsed, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color, html: html)
63
71
  end
64
72
 
65
73
  pct_parsed = 100.0 * count_parsed / (count_parsed + count_noresult)
@@ -75,6 +83,7 @@ print = nil
75
83
  parser_name = :strict
76
84
  escape = false
77
85
  color = true
86
+ html = false
78
87
  PARSERS = {
79
88
  strict: FoodIngredientParser::Strict::Parser,
80
89
  loose: FoodIngredientParser::Loose::Parser
@@ -95,6 +104,7 @@ OptionParser.new do |opts|
95
104
  opts.on("-r", "--parser PARSER", "Use specific parser (#{PARSERS.keys.join(", ")}).") {|p| parser_name = p&.downcase&.to_sym }
96
105
  opts.on("-e", "--[no-]escape", "Escape newlines") {|e| escape = !!e }
97
106
  opts.on("-c", "--[no-]color", "Use color") {|e| color = !!e }
107
+ opts.on( "--[no-]html", "Print as HTML with parsing markup") {|e| html = !!e }
98
108
  opts.on("-v", "--[no-]verbose", "Show more data (parsed tree).") {|v| verbosity = v ? 2 : 1 }
99
109
  opts.on( "--version", "Show program version.") do
100
110
  puts("food_ingredient_parser v#{FoodIngredientParser::VERSION}")
@@ -112,8 +122,8 @@ if strings.any? || files.any?
112
122
  exit(1)
113
123
  end
114
124
  success = true
115
- strings.each {|s| success &= parse_single(s, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color) }
116
- files.each {|f| success &= parse_file(f, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color) == 0 }
125
+ strings.each {|s| success &= parse_single(s, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color, html: html) }
126
+ files.each {|f| success &= parse_file(f, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color, html: html) == 0 }
117
127
  success or exit(1)
118
128
  else
119
129
  STDERR.puts("Please specify one or more --file or --string arguments (see --help).")
@@ -4,8 +4,9 @@ module FoodIngredientParser::Loose
4
4
  class Scanner
5
5
 
6
6
  SEP_CHARS = "|;,.".freeze
7
+ AND_SEP_RE = /\A\s*(and|en|und)\s+/i.freeze
7
8
  MARK_CHARS = "¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº⁽⁾†‡⁺•°▪◊#^˄*~".freeze
8
- PREFIX_RE = /\A\s*(ingredients|contains|ingred[iï][eë]nt(en)?(declaratie)?|bevat|dit zit er\s?in|samenstelling|zutaten)\b\s*[:;.]?\s*/i.freeze
9
+ PREFIX_RE = /\A\s*(ingredients(\s*list)?|contains|ingred[iï][eë]nt(en)?(declaratie)?|bevat|dit zit er\s?in|samenstelling|zutaten)\b\s*[:;.]?\s*/i.freeze
9
10
  NOTE_RE = /\A\b(dit product kan\b|deze verpakking kan\b|kan sporen\b.*?\bbevatten\b|voor allergenen\b|allergenen\b|allergie[- ]informatie(\s*:|\b)|E\s*=|gemaakt in\b|geproduceerd in\b|bevat mogelijk\b|kijk voor meer\b|allergie-info|in de fabriek\b|in dit bedrijf\b|voor [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bis [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bgebruikt\b)/i.freeze
10
11
  # Keep in sync with +abbrev+ in the +Common+ grammar, plus relevant ones from the +Amount+ grammar.
11
12
  ABBREV_RE = Regexp.union(
@@ -19,12 +20,12 @@ module FoodIngredientParser::Loose
19
20
  T\.\s*aestivum\b(\s+vitt\.)? |
20
21
  nucifera\s+L\. |
21
22
  type\s+"\d+" |
22
- E[- ]?\d{3}[a-z]?\s*\([iv]+\) |
23
+ E(-|\s+)?\d{3}[a-z]?\s*(\([iv]+\)|\[[iv]+\]) |
23
24
  www\.[-_\/:%.A-Za-z0-9]+
24
25
  )/xi,
25
26
  *%w[
26
- a.o.p b.g.a b.o.b c.a c.i d.e d.m.v d.o.c d.o.p d.s e.a e.g e.u f.i.l f.o.s i.a
27
- i.d i.e i.g.m.e i.g.p i.m.v i.o i.v.m l.s.l n.a n.b n.o n.v.t o.a o.b.v p.d.o
27
+ a.o.p b.g.a b.o.b c.a c.i d.e d.m.v d.o.c d.o.p d.s e.a e.g e.u f.i.l f.o.s h.o.h
28
+ i.a i.d i.e i.g.m.e i.g.p i.m.v i.o i.v.m l.s.l n.a n.b n.o n.v.t o.a o.b.v p.d.o
28
29
  p.g.i q.s s.l s.s t.o.v u.h.t v.g v.s w.a w.o w.v vit denat alc vol conc subsp
29
30
  min max ca
30
31
  ].map {|s| /\A#{Regexp.escape(s)}\b\.?/i}
@@ -75,6 +76,11 @@ module FoodIngredientParser::Loose
75
76
  elsif ")]".include?(c) # close nesting
76
77
  add_child
77
78
  close_parent
79
+ # after bracket check for 'and' to not lose text
80
+ if is_and_sep?(@i+1)
81
+ @i += and_sep_len(@i+1)
82
+ add_child
83
+ end
78
84
  elsif is_notes_start? # usually a dot marks the start of notes
79
85
  close_all_ancestors
80
86
  @iterator = :notes
@@ -148,6 +154,15 @@ module FoodIngredientParser::Loose
148
154
  chars.include?(c) && @s[@i-1..@i+1] !~ /\A\d.\d\z/
149
155
  end
150
156
 
157
# Whether the text at position +i+ starts with a conjunction separator,
# i.e. whether +AND_SEP_RE+ matches there.
#
# @param i [Integer] index into the scanned string, defaults to the current position
# @return [Boolean]
def is_and_sep?(i = @i)
  and_sep_len(i) > 0
end

# Length of the conjunction separator at position +i+.
#
# @param i [Integer] index into the scanned string, defaults to the current position
# @return [Integer] number of characters matched by +AND_SEP_RE+ at +i+, or 0
#   when there is no match or +i+ lies beyond the end of the string
def and_sep_len(i = @i)
  rest = @s[i..-1]
  return 0 unless rest # slicing past the end of the string yields nil

  m = rest.match(AND_SEP_RE)
  m ? m.end(0) : 0
end
165
+
151
166
  def is_mark?(i = @i)
152
167
  mark_len(i) > 0 && @s[i..i+1] !~ /\A°[CF]/
153
168
  end
@@ -7,8 +7,8 @@ module FoodIngredientParser::Loose
7
7
  #
8
8
  # @note mark and amount is lost, this is not expected on e-numbers
9
9
 
10
- SPLIT_RE = /\s*-\s*/.freeze
11
- SINGLE_RE = /E-?\d{3}[a-z]?(?:\s*\([iv]+\))?/i.freeze
10
+ SPLIT_RE = /\s*(-|\ben\b|\band\b|\bund\b|\bet\b)\s*/.freeze
11
+ SINGLE_RE = /E(-|\s+)?\d{3}[a-z]?\s*(\([iv]+\)|\[[iv]+\])?/i.freeze
12
12
  MATCH_RE = /\A\s*(#{SINGLE_RE})(?:#{SPLIT_RE}(#{SINGLE_RE}))+\s*\z/i.freeze
13
13
 
14
14
  def self.transform!(node)
@@ -58,8 +58,8 @@ module FoodIngredientParser::Strict::Grammar
58
58
  end
59
59
 
60
60
  rule e_number
61
- ( 'E'i '-'? [0-9] [0-9] [0-9] [[:alpha:]]? )
62
- ( ( ws* '(' [iIvV]+ ')' ) / ![[:alnum:]] ) # e.g. "E450 (iii)"
61
+ ( 'E'i ( '-' / ws+ )? [0-9] [0-9] [0-9] [[:alpha:]]? )
62
+ ( ( ws* '(' [iIvV]+ ')' ) / ( ws* '[' [iIvV]+ ']' ) / ![[:alnum:]] ) # e.g. "E450 (iii)"
63
63
  end
64
64
 
65
65
  rule chem_systematic_name
@@ -102,6 +102,7 @@ module FoodIngredientParser::Strict::Grammar
102
102
  'e.u'i /
103
103
  'f.i.l'i /
104
104
  'f.o.s'i /
105
+ 'h.o.h'i /
105
106
  'i.a'i /
106
107
  'i.d'i /
107
108
  'i.e'i /
@@ -5,7 +5,12 @@ module FoodIngredientParser::Strict::Grammar
5
5
  include IngredientColoned
6
6
 
7
7
  rule ingredient
8
- ws* ( ingredient_nested / ingredient_coloned / ingredient_simple_with_amount )
8
+ ws*
9
+ (
10
+ ingredient_nested ( ws* and ws+ ingredient )? /
11
+ ingredient_coloned /
12
+ ingredient_simple_with_amount
13
+ )
9
14
  end
10
15
 
11
16
  end
@@ -5,15 +5,15 @@ module FoodIngredientParser::Strict::Grammar
5
5
  include Ingredient
6
6
 
7
7
  rule list_coloned
8
- contains:( ( ws* list_coloned_ingredient ws* '.,')+ ws* list_coloned_ingredient ) <ListNode> /
9
- contains:( ( ws* list_coloned_ingredient ws* '.,')+ ) <ListNode> /
10
- contains:( ( ws* list_coloned_ingredient ws* '.' )+ ws* list_coloned_ingredient ) <ListNode> /
11
- contains:( ( ws* list_coloned_ingredient ws* '.' )+ ) <ListNode> /
12
- contains:( ( ws* list_coloned_ingredient ws* ';,')+ ws* list_coloned_ingredient ) <ListNode> /
13
- contains:( ( ws* list_coloned_ingredient ws* ';,')+ ) <ListNode> /
14
- contains:( ( ws* list_coloned_ingredient ws* ';' )+ ws* list_coloned_ingredient ) <ListNode> /
15
- contains:( ( ws* list_coloned_ingredient ws* ';' )+ ) <ListNode> /
16
- contains:( ws* list_coloned_ingredient ) <ListNode>
8
+ contains:( ( ws* list_coloned_ingredient ws* '.' ws* ',' )+ ws* list_coloned_ingredient ) <ListNode> /
9
+ contains:( ( ws* list_coloned_ingredient ws* '.' ws* ',' )+ ) <ListNode> /
10
+ contains:( ( ws* list_coloned_ingredient ws* '.' )+ ws* list_coloned_ingredient ) <ListNode> /
11
+ contains:( ( ws* list_coloned_ingredient ws* '.' )+ ) <ListNode> /
12
+ contains:( ( ws* list_coloned_ingredient ws* ';' ws* ',' )+ ws* list_coloned_ingredient ) <ListNode> /
13
+ contains:( ( ws* list_coloned_ingredient ws* ';' ws* ',' )+ ) <ListNode> /
14
+ contains:( ( ws* list_coloned_ingredient ws* ';' )+ ws* list_coloned_ingredient ) <ListNode> /
15
+ contains:( ( ws* list_coloned_ingredient ws* ';' )+ ) <ListNode> /
16
+ contains:( ws* list_coloned_ingredient ) <ListNode>
17
17
  end
18
18
 
19
19
  rule list_coloned_inner_list
@@ -22,7 +22,7 @@ module FoodIngredientParser::Strict::Grammar
22
22
  end
23
23
 
24
24
  rule list_coloned_ingredient
25
- ing:ingredient_simple_with_amount ws* ':' ws* amount:amount post:( ws* '}' )? <IngredientNode> /
25
+ ing:ingredient_simple_with_amount ws* ':' ws* amount:amount post:( ws* '}' )? !( ws* word ) <IngredientNode> /
26
26
  ing:ingredient_simple_with_amount ws* ':' post:( ws* '}' )? ws* contains:list_coloned_inner_list <NestedIngredientNode>
27
27
  end
28
28
 
@@ -19,9 +19,10 @@ module FoodIngredientParser::Strict::Grammar
19
19
 
20
20
  rule root_prefix
21
21
  (
22
- 'ingredients'i / 'contains'i /
22
+ 'ingredients'i ( ws+ 'list'i )? / 'contains'i /
23
23
  ('ingred'i [IÏiï] [EËeë] 'n'i ( 't'i 'en'i? 'declaratie'i? )? ) / 'bevat'i / 'dit zit er in'i / 'samenstelling'i /
24
- 'zutaten'i
24
+ 'zutaten'i /
25
+ 'ingredienser'i
25
26
  )
26
27
  ( ws* [:;.] ( ws* newline )? / ws* newline / ws ) ws* # optional colon or other separator
27
28
  "'"? ws* # stray quote occurs sometimes
@@ -1,4 +1,4 @@
1
1
  module FoodIngredientParser
2
- VERSION = '1.1.9'
3
- VERSION_DATE = '2021-01-12'
2
+ VERSION = '1.2.0'
3
+ VERSION_DATE = '2024-01-19'
4
4
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: food_ingredient_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.9
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - wvengen
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-01-12 00:00:00.000000000 Z
11
+ date: 2024-01-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: treetop
@@ -87,7 +87,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
87
87
  - !ruby/object:Gem::Version
88
88
  version: '0'
89
89
  requirements: []
90
- rubygems_version: 3.0.3
90
+ rubygems_version: 3.1.6
91
91
  signing_key:
92
92
  specification_version: 4
93
93
  summary: Parser for ingredient lists found on food products.