food_ingredient_parser 1.1.9 → 1.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1e83f2b90f83d4cb0deb193140dcce6f4ee1700a80c781c95b7fa09219da9f69
4
- data.tar.gz: '0929b958e93cf8e61e54b5e268ad32135aa45e87d1c163ae912790de55a85e28'
3
+ metadata.gz: 7c478a080e36c8f48ee3dbd6e9978eadec3758a4b0ab6fab571e18f103ed6bf0
4
+ data.tar.gz: aa078366f72ab03d038d497c908a3ad92f5816f37d3f0308fa64e81680905dea
5
5
  SHA512:
6
- metadata.gz: 929fd7057be9e1f35ec1db55e08bad1078d8bc1f977b07b25c26f3f0ce0919e0cb82ca4379e050067b20ab5cee760fb10c2d267d60fc7555d18af1e8ae9ca9e9
7
- data.tar.gz: 4873942e33ad823fbf84391cc70c4f4760cf0d93cceced307814d3abf73f5e91af8529fcfa048345b0a896dd967deac729d9051ab7e89b3c4174c8ee42f9f61b
6
+ metadata.gz: d8acbd71e431958a72350e6fd1d3e5e8d21db8ee53525c53a08bbe2c564734fca9601ac0fdc33d9737695f292bc7cd6da898721f02f68ca8f87175c5b276c709
7
+ data.tar.gz: f261a1537a6e903d55b36dc91c0a1c302893d7a092a83afb8598730c87142d041fdaf86540918697f297f5024461082e5e56594df64f603f56f4b22148c7c9fd
data/README.md CHANGED
@@ -69,6 +69,7 @@ Usage: bin/food_ingredient_parser [options] --file|-f <filename>
69
69
  -r, --parser PARSER Use specific parser (strict, loose).
70
70
  -e, --[no-]escape Escape newlines
71
71
  -c, --[no-]color Use color
72
+ --[no-]html Print as HTML with parsing markup
72
73
  -v, --[no-]verbose Show more data (parsed tree).
73
74
  --version Show program version.
74
75
  -h, --help Show this help
@@ -103,6 +104,9 @@ RootNode+Root3 offset=0, "tomato" (contains,notes):
103
104
  SyntaxNode offset=6, ""
104
105
  {:contains=>[{:name=>"tomato"}]}
105
106
 
107
+ $ food_ingredient_parser --html -s "tomato"
108
+ <div class="root"><span class='depth0'><span class='name'>tomato</span></span></div>
109
+
106
110
  $ food_ingredient_parser -v -r loose -s "tomato"
107
111
  "tomato"
108
112
  Node interval=0..5
@@ -193,7 +197,7 @@ plus a bit of English and German. Support for other languages is already good, b
193
197
  areas: improvements are welcome (starting with a corpus in [data/](data/)).
194
198
 
195
199
  Many ingredient lists from the USA are structured a bit differently than those from Europe, they
196
- parse less well (that that's a matter of tine-tuning).
200
+ parse less well (that is probably a matter of fine-tuning).
197
201
 
198
202
  ## Test data
199
203
 
@@ -2,6 +2,7 @@
2
2
  #
3
3
  # Parser for food ingredient lists.
4
4
  #
5
+ require 'cgi'
5
6
  require 'optparse'
6
7
 
7
8
  $:.push(File.expand_path(File.dirname(__FILE__) + "/../lib"))
@@ -31,24 +32,31 @@ def colorize(color, s)
31
32
  end
32
33
  end
33
34
 
34
- def parse_single(s, parsed=nil, parser:, verbosity: 1, print: nil, escape: false, color: false)
35
+ def parse_single(s, parsed=nil, parser:, verbosity: 1, print: nil, escape: false, color: false, html: false)
35
36
  parsed ||= parser.parse(s)
36
37
 
37
38
  return unless print.nil? || (parsed && print == :parsed) || (!parsed && print == :noresult)
38
39
 
39
- puts colorize(color && "0;32", escape ? s.gsub("\n", "\\n") : s) if verbosity > 0
40
+ puts colorize(color && "0;32", escape ? s.gsub("\n", "\\n") : s) if !html && verbosity > 0
40
41
 
41
- if parsed
42
+ if !html && parsed
42
43
  puts(parsed.inspect) if verbosity > 1
43
44
  pp(parsed.to_h, color: color) if verbosity > 0
44
- return true
45
- else
45
+
46
+ elsif !html && !parsed
46
47
  puts "(no result: #{parser.parser.failure_reason})" if verbosity > 0
47
- return false
48
+
49
+ elsif html && parsed
50
+ puts('<div class="root">' + parsed.to_html + '</div>') if verbosity > 0
51
+
52
+ else
53
+ puts('<div class="root">' + CGI.escapeHTML(s) + '</div>') if verbosity > 0
48
54
  end
55
+
56
+ return !!parsed
49
57
  end
50
58
 
51
- def parse_file(path, parser:, verbosity: 1, print: nil, escape: false, color: false)
59
+ def parse_file(path, parser:, verbosity: 1, print: nil, escape: false, color: false, html: false)
52
60
  count_parsed = count_noresult = 0
53
61
  File.foreach(path) do |line|
54
62
  next if line =~ /^#/ # comment
@@ -59,7 +67,7 @@ def parse_file(path, parser:, verbosity: 1, print: nil, escape: false, color: fa
59
67
  count_parsed += 1 if parsed
60
68
  count_noresult += 1 unless parsed
61
69
 
62
- parse_single(line, parsed, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color)
70
+ parse_single(line, parsed, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color, html: html)
63
71
  end
64
72
 
65
73
  pct_parsed = 100.0 * count_parsed / (count_parsed + count_noresult)
@@ -75,6 +83,7 @@ print = nil
75
83
  parser_name = :strict
76
84
  escape = false
77
85
  color = true
86
+ html = false
78
87
  PARSERS = {
79
88
  strict: FoodIngredientParser::Strict::Parser,
80
89
  loose: FoodIngredientParser::Loose::Parser
@@ -95,6 +104,7 @@ OptionParser.new do |opts|
95
104
  opts.on("-r", "--parser PARSER", "Use specific parser (#{PARSERS.keys.join(", ")}).") {|p| parser_name = p&.downcase&.to_sym }
96
105
  opts.on("-e", "--[no-]escape", "Escape newlines") {|e| escape = !!e }
97
106
  opts.on("-c", "--[no-]color", "Use color") {|e| color = !!e }
107
+ opts.on( "--[no-]html", "Print as HTML with parsing markup") {|e| html = !!e }
98
108
  opts.on("-v", "--[no-]verbose", "Show more data (parsed tree).") {|v| verbosity = v ? 2 : 1 }
99
109
  opts.on( "--version", "Show program version.") do
100
110
  puts("food_ingredient_parser v#{FoodIngredientParser::VERSION}")
@@ -112,8 +122,8 @@ if strings.any? || files.any?
112
122
  exit(1)
113
123
  end
114
124
  success = true
115
- strings.each {|s| success &= parse_single(s, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color) }
116
- files.each {|f| success &= parse_file(f, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color) == 0 }
125
+ strings.each {|s| success &= parse_single(s, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color, html: html) }
126
+ files.each {|f| success &= parse_file(f, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color, html: html) == 0 }
117
127
  success or exit(1)
118
128
  else
119
129
  STDERR.puts("Please specify one or more --file or --string arguments (see --help).")
@@ -4,8 +4,9 @@ module FoodIngredientParser::Loose
4
4
  class Scanner
5
5
 
6
6
  SEP_CHARS = "|;,.".freeze
7
+ AND_SEP_RE = /\A\s*(and|en|und)\s+/i.freeze
7
8
  MARK_CHARS = "¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº⁽⁾†‡⁺•°▪◊#^˄*~".freeze
8
- PREFIX_RE = /\A\s*(ingredients|contains|ingred[iï][eë]nt(en)?(declaratie)?|bevat|dit zit er\s?in|samenstelling|zutaten)\b\s*[:;.]?\s*/i.freeze
9
+ PREFIX_RE = /\A\s*(ingredients(\s*list)?|contains|ingred[iï][eë]nt(en)?(declaratie)?|bevat|dit zit er\s?in|samenstelling|zutaten)\b\s*[:;.]?\s*/i.freeze
9
10
  NOTE_RE = /\A\b(dit product kan\b|deze verpakking kan\b|kan sporen\b.*?\bbevatten\b|voor allergenen\b|allergenen\b|allergie[- ]informatie(\s*:|\b)|E\s*=|gemaakt in\b|geproduceerd in\b|bevat mogelijk\b|kijk voor meer\b|allergie-info|in de fabriek\b|in dit bedrijf\b|voor [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bis [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bgebruikt\b)/i.freeze
10
11
  # Keep in sync with +abbrev+ in the +Common+ grammar, plus relevant ones from the +Amount+ grammar.
11
12
  ABBREV_RE = Regexp.union(
@@ -19,12 +20,12 @@ module FoodIngredientParser::Loose
19
20
  T\.\s*aestivum\b(\s+vitt\.)? |
20
21
  nucifera\s+L\. |
21
22
  type\s+"\d+" |
22
- E[- ]?\d{3}[a-z]?\s*\([iv]+\) |
23
+ E(-|\s+)?\d{3}[a-z]?\s*(\([iv]+\)|\[[iv]+\]) |
23
24
  www\.[-_\/:%.A-Za-z0-9]+
24
25
  )/xi,
25
26
  *%w[
26
- a.o.p b.g.a b.o.b c.a c.i d.e d.m.v d.o.c d.o.p d.s e.a e.g e.u f.i.l f.o.s i.a
27
- i.d i.e i.g.m.e i.g.p i.m.v i.o i.v.m l.s.l n.a n.b n.o n.v.t o.a o.b.v p.d.o
27
+ a.o.p b.g.a b.o.b c.a c.i d.e d.m.v d.o.c d.o.p d.s e.a e.g e.u f.i.l f.o.s h.o.h
28
+ i.a i.d i.e i.g.m.e i.g.p i.m.v i.o i.v.m l.s.l n.a n.b n.o n.v.t o.a o.b.v p.d.o
28
29
  p.g.i q.s s.l s.s t.o.v u.h.t v.g v.s w.a w.o w.v vit denat alc vol conc subsp
29
30
  min max ca
30
31
  ].map {|s| /\A#{Regexp.escape(s)}\b\.?/i}
@@ -75,6 +76,11 @@ module FoodIngredientParser::Loose
75
76
  elsif ")]".include?(c) # close nesting
76
77
  add_child
77
78
  close_parent
79
+ # after bracket check for 'and' to not lose text
80
+ if is_and_sep?(@i+1)
81
+ @i += and_sep_len(@i+1)
82
+ add_child
83
+ end
78
84
  elsif is_notes_start? # usually a dot marks the start of notes
79
85
  close_all_ancestors
80
86
  @iterator = :notes
@@ -148,6 +154,15 @@ module FoodIngredientParser::Loose
148
154
  chars.include?(c) && @s[@i-1..@i+1] !~ /\A\d.\d\z/
149
155
  end
150
156
 
157
+ def is_and_sep?(i = @i)
158
+ and_sep_len(i) > 0
159
+ end
160
+
161
+ def and_sep_len(i = @i)
162
+ m = @s[i..-1].match(AND_SEP_RE)
163
+ m ? m.offset(0).last : 0
164
+ end
165
+
151
166
  def is_mark?(i = @i)
152
167
  mark_len(i) > 0 && @s[i..i+1] !~ /\A°[CF]/
153
168
  end
@@ -7,8 +7,8 @@ module FoodIngredientParser::Loose
7
7
  #
8
8
  # @note mark and amount is lost, this is not expected on e-numbers
9
9
 
10
- SPLIT_RE = /\s*-\s*/.freeze
11
- SINGLE_RE = /E-?\d{3}[a-z]?(?:\s*\([iv]+\))?/i.freeze
10
+ SPLIT_RE = /\s*(-|\ben\b|\band\b|\bund\b|\bet\b)\s*/.freeze
11
+ SINGLE_RE = /E(-|\s+)?\d{3}[a-z]?\s*(\([iv]+\)|\[[iv]+\])?/i.freeze
12
12
  MATCH_RE = /\A\s*(#{SINGLE_RE})(?:#{SPLIT_RE}(#{SINGLE_RE}))+\s*\z/i.freeze
13
13
 
14
14
  def self.transform!(node)
@@ -58,8 +58,8 @@ module FoodIngredientParser::Strict::Grammar
58
58
  end
59
59
 
60
60
  rule e_number
61
- ( 'E'i '-'? [0-9] [0-9] [0-9] [[:alpha:]]? )
62
- ( ( ws* '(' [iIvV]+ ')' ) / ![[:alnum:]] ) # e.g. "E450 (iii)"
61
+ ( 'E'i ( '-' / ws+ )? [0-9] [0-9] [0-9] [[:alpha:]]? )
62
+ ( ( ws* '(' [iIvV]+ ')' ) / ( ws* '[' [iIvV]+ ']' ) / ![[:alnum:]] ) # e.g. "E450 (iii)"
63
63
  end
64
64
 
65
65
  rule chem_systematic_name
@@ -102,6 +102,7 @@ module FoodIngredientParser::Strict::Grammar
102
102
  'e.u'i /
103
103
  'f.i.l'i /
104
104
  'f.o.s'i /
105
+ 'h.o.h'i /
105
106
  'i.a'i /
106
107
  'i.d'i /
107
108
  'i.e'i /
@@ -5,7 +5,12 @@ module FoodIngredientParser::Strict::Grammar
5
5
  include IngredientColoned
6
6
 
7
7
  rule ingredient
8
- ws* ( ingredient_nested / ingredient_coloned / ingredient_simple_with_amount )
8
+ ws*
9
+ (
10
+ ingredient_nested ( ws* and ws+ ingredient )? /
11
+ ingredient_coloned /
12
+ ingredient_simple_with_amount
13
+ )
9
14
  end
10
15
 
11
16
  end
@@ -5,15 +5,15 @@ module FoodIngredientParser::Strict::Grammar
5
5
  include Ingredient
6
6
 
7
7
  rule list_coloned
8
- contains:( ( ws* list_coloned_ingredient ws* '.,')+ ws* list_coloned_ingredient ) <ListNode> /
9
- contains:( ( ws* list_coloned_ingredient ws* '.,')+ ) <ListNode> /
10
- contains:( ( ws* list_coloned_ingredient ws* '.' )+ ws* list_coloned_ingredient ) <ListNode> /
11
- contains:( ( ws* list_coloned_ingredient ws* '.' )+ ) <ListNode> /
12
- contains:( ( ws* list_coloned_ingredient ws* ';,')+ ws* list_coloned_ingredient ) <ListNode> /
13
- contains:( ( ws* list_coloned_ingredient ws* ';,')+ ) <ListNode> /
14
- contains:( ( ws* list_coloned_ingredient ws* ';' )+ ws* list_coloned_ingredient ) <ListNode> /
15
- contains:( ( ws* list_coloned_ingredient ws* ';' )+ ) <ListNode> /
16
- contains:( ws* list_coloned_ingredient ) <ListNode>
8
+ contains:( ( ws* list_coloned_ingredient ws* '.' ws* ',' )+ ws* list_coloned_ingredient ) <ListNode> /
9
+ contains:( ( ws* list_coloned_ingredient ws* '.' ws* ',' )+ ) <ListNode> /
10
+ contains:( ( ws* list_coloned_ingredient ws* '.' )+ ws* list_coloned_ingredient ) <ListNode> /
11
+ contains:( ( ws* list_coloned_ingredient ws* '.' )+ ) <ListNode> /
12
+ contains:( ( ws* list_coloned_ingredient ws* ';' ws* ',' )+ ws* list_coloned_ingredient ) <ListNode> /
13
+ contains:( ( ws* list_coloned_ingredient ws* ';' ws* ',' )+ ) <ListNode> /
14
+ contains:( ( ws* list_coloned_ingredient ws* ';' )+ ws* list_coloned_ingredient ) <ListNode> /
15
+ contains:( ( ws* list_coloned_ingredient ws* ';' )+ ) <ListNode> /
16
+ contains:( ws* list_coloned_ingredient ) <ListNode>
17
17
  end
18
18
 
19
19
  rule list_coloned_inner_list
@@ -22,7 +22,7 @@ module FoodIngredientParser::Strict::Grammar
22
22
  end
23
23
 
24
24
  rule list_coloned_ingredient
25
- ing:ingredient_simple_with_amount ws* ':' ws* amount:amount post:( ws* '}' )? <IngredientNode> /
25
+ ing:ingredient_simple_with_amount ws* ':' ws* amount:amount post:( ws* '}' )? !( ws* word ) <IngredientNode> /
26
26
  ing:ingredient_simple_with_amount ws* ':' post:( ws* '}' )? ws* contains:list_coloned_inner_list <NestedIngredientNode>
27
27
  end
28
28
 
@@ -19,9 +19,10 @@ module FoodIngredientParser::Strict::Grammar
19
19
 
20
20
  rule root_prefix
21
21
  (
22
- 'ingredients'i / 'contains'i /
22
+ 'ingredients'i ( ws+ 'list'i )? / 'contains'i /
23
23
  ('ingred'i [IÏiï] [EËeë] 'n'i ( 't'i 'en'i? 'declaratie'i? )? ) / 'bevat'i / 'dit zit er in'i / 'samenstelling'i /
24
- 'zutaten'i
24
+ 'zutaten'i /
25
+ 'ingredienser'i
25
26
  )
26
27
  ( ws* [:;.] ( ws* newline )? / ws* newline / ws ) ws* # optional colon or other separator
27
28
  "'"? ws* # stray quote occurs sometimes
@@ -1,4 +1,4 @@
1
1
  module FoodIngredientParser
2
- VERSION = '1.1.9'
3
- VERSION_DATE = '2021-01-12'
2
+ VERSION = '1.2.0'
3
+ VERSION_DATE = '2024-01-19'
4
4
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: food_ingredient_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.9
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - wvengen
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-01-12 00:00:00.000000000 Z
11
+ date: 2024-01-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: treetop
@@ -87,7 +87,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
87
87
  - !ruby/object:Gem::Version
88
88
  version: '0'
89
89
  requirements: []
90
- rubygems_version: 3.0.3
90
+ rubygems_version: 3.1.6
91
91
  signing_key:
92
92
  specification_version: 4
93
93
  summary: Parser for ingredient lists found on food products.