food_ingredient_parser 1.1.9 → 1.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +5 -1
- data/bin/food_ingredient_parser +20 -10
- data/lib/food_ingredient_parser/loose/scanner.rb +19 -4
- data/lib/food_ingredient_parser/loose/transform/split_e_numbers.rb +2 -2
- data/lib/food_ingredient_parser/strict/grammar/common.treetop +3 -2
- data/lib/food_ingredient_parser/strict/grammar/ingredient.treetop +6 -1
- data/lib/food_ingredient_parser/strict/grammar/list_coloned.treetop +10 -10
- data/lib/food_ingredient_parser/strict/grammar/root.treetop +3 -2
- data/lib/food_ingredient_parser/version.rb +2 -2
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7c478a080e36c8f48ee3dbd6e9978eadec3758a4b0ab6fab571e18f103ed6bf0
|
4
|
+
data.tar.gz: aa078366f72ab03d038d497c908a3ad92f5816f37d3f0308fa64e81680905dea
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d8acbd71e431958a72350e6fd1d3e5e8d21db8ee53525c53a08bbe2c564734fca9601ac0fdc33d9737695f292bc7cd6da898721f02f68ca8f87175c5b276c709
|
7
|
+
data.tar.gz: f261a1537a6e903d55b36dc91c0a1c302893d7a092a83afb8598730c87142d041fdaf86540918697f297f5024461082e5e56594df64f603f56f4b22148c7c9fd
|
data/README.md
CHANGED
@@ -69,6 +69,7 @@ Usage: bin/food_ingredient_parser [options] --file|-f <filename>
|
|
69
69
|
-r, --parser PARSER Use specific parser (strict, loose).
|
70
70
|
-e, --[no-]escape Escape newlines
|
71
71
|
-c, --[no-]color Use color
|
72
|
+
--[no-]html Print as HTML with parsing markup
|
72
73
|
-v, --[no-]verbose Show more data (parsed tree).
|
73
74
|
--version Show program version.
|
74
75
|
-h, --help Show this help
|
@@ -103,6 +104,9 @@ RootNode+Root3 offset=0, "tomato" (contains,notes):
|
|
103
104
|
SyntaxNode offset=6, ""
|
104
105
|
{:contains=>[{:name=>"tomato"}]}
|
105
106
|
|
107
|
+
$ food_ingredient_parser --html -s "tomato"
|
108
|
+
<div class="root"><span class='depth0'><span class='name'>tomato</span></span></div>
|
109
|
+
|
106
110
|
$ food_ingredient_parser -v -r loose -s "tomato"
|
107
111
|
"tomato"
|
108
112
|
Node interval=0..5
|
@@ -193,7 +197,7 @@ plus a bit of English and German. Support for other languages is already good, b
|
|
193
197
|
areas: improvements are welcome (starting with a corpus in [data/](data/)).
|
194
198
|
|
195
199
|
Many ingredient lists from the USA are structured a bit differently than those from Europe, they
|
196
|
-
parse less well (that
|
200
|
+
parse less well (that is probably a matter of fine-tuning).
|
197
201
|
|
198
202
|
## Test data
|
199
203
|
|
data/bin/food_ingredient_parser
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
#
|
3
3
|
# Parser for food ingredient lists.
|
4
4
|
#
|
5
|
+
require 'cgi'
|
5
6
|
require 'optparse'
|
6
7
|
|
7
8
|
$:.push(File.expand_path(File.dirname(__FILE__) + "/../lib"))
|
@@ -31,24 +32,31 @@ def colorize(color, s)
|
|
31
32
|
end
|
32
33
|
end
|
33
34
|
|
34
|
-
def parse_single(s, parsed=nil, parser:, verbosity: 1, print: nil, escape: false, color: false)
|
35
|
+
def parse_single(s, parsed=nil, parser:, verbosity: 1, print: nil, escape: false, color: false, html: false)
|
35
36
|
parsed ||= parser.parse(s)
|
36
37
|
|
37
38
|
return unless print.nil? || (parsed && print == :parsed) || (!parsed && print == :noresult)
|
38
39
|
|
39
|
-
puts colorize(color && "0;32", escape ? s.gsub("\n", "\\n") : s) if verbosity > 0
|
40
|
+
puts colorize(color && "0;32", escape ? s.gsub("\n", "\\n") : s) if !html && verbosity > 0
|
40
41
|
|
41
|
-
if parsed
|
42
|
+
if !html && parsed
|
42
43
|
puts(parsed.inspect) if verbosity > 1
|
43
44
|
pp(parsed.to_h, color: color) if verbosity > 0
|
44
|
-
|
45
|
-
|
45
|
+
|
46
|
+
elsif !html && !parsed
|
46
47
|
puts "(no result: #{parser.parser.failure_reason})" if verbosity > 0
|
47
|
-
|
48
|
+
|
49
|
+
elsif html && parsed
|
50
|
+
puts('<div class="root">' + parsed.to_html + '</div>') if verbosity > 0
|
51
|
+
|
52
|
+
else
|
53
|
+
puts('<div class="root">' + CGI.escapeHTML(parsed) + '</div>') if verbosity > 0
|
48
54
|
end
|
55
|
+
|
56
|
+
return !!parsed
|
49
57
|
end
|
50
58
|
|
51
|
-
def parse_file(path, parser:, verbosity: 1, print: nil, escape: false, color: false)
|
59
|
+
def parse_file(path, parser:, verbosity: 1, print: nil, escape: false, color: false, html: false)
|
52
60
|
count_parsed = count_noresult = 0
|
53
61
|
File.foreach(path) do |line|
|
54
62
|
next if line =~ /^#/ # comment
|
@@ -59,7 +67,7 @@ def parse_file(path, parser:, verbosity: 1, print: nil, escape: false, color: fa
|
|
59
67
|
count_parsed += 1 if parsed
|
60
68
|
count_noresult += 1 unless parsed
|
61
69
|
|
62
|
-
parse_single(line, parsed, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color)
|
70
|
+
parse_single(line, parsed, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color, html: html)
|
63
71
|
end
|
64
72
|
|
65
73
|
pct_parsed = 100.0 * count_parsed / (count_parsed + count_noresult)
|
@@ -75,6 +83,7 @@ print = nil
|
|
75
83
|
parser_name = :strict
|
76
84
|
escape = false
|
77
85
|
color = true
|
86
|
+
html = false
|
78
87
|
PARSERS = {
|
79
88
|
strict: FoodIngredientParser::Strict::Parser,
|
80
89
|
loose: FoodIngredientParser::Loose::Parser
|
@@ -95,6 +104,7 @@ OptionParser.new do |opts|
|
|
95
104
|
opts.on("-r", "--parser PARSER", "Use specific parser (#{PARSERS.keys.join(", ")}).") {|p| parser_name = p&.downcase&.to_sym }
|
96
105
|
opts.on("-e", "--[no-]escape", "Escape newlines") {|e| escape = !!e }
|
97
106
|
opts.on("-c", "--[no-]color", "Use color") {|e| color = !!e }
|
107
|
+
opts.on( "--[no-]html", "Print as HTML with parsing markup") {|e| html = !!e }
|
98
108
|
opts.on("-v", "--[no-]verbose", "Show more data (parsed tree).") {|v| verbosity = v ? 2 : 1 }
|
99
109
|
opts.on( "--version", "Show program version.") do
|
100
110
|
puts("food_ingredient_parser v#{FoodIngredientParser::VERSION}")
|
@@ -112,8 +122,8 @@ if strings.any? || files.any?
|
|
112
122
|
exit(1)
|
113
123
|
end
|
114
124
|
success = true
|
115
|
-
strings.each {|s| success &= parse_single(s, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color) }
|
116
|
-
files.each {|f| success &= parse_file(f, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color) == 0 }
|
125
|
+
strings.each {|s| success &= parse_single(s, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color, html: html) }
|
126
|
+
files.each {|f| success &= parse_file(f, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color, html: html) == 0 }
|
117
127
|
success or exit(1)
|
118
128
|
else
|
119
129
|
STDERR.puts("Please specify one or more --file or --string arguments (see --help).")
|
@@ -4,8 +4,9 @@ module FoodIngredientParser::Loose
|
|
4
4
|
class Scanner
|
5
5
|
|
6
6
|
SEP_CHARS = "|;,.".freeze
|
7
|
+
AND_SEP_RE = /\A\s*(and|en|und)\s+/i.freeze
|
7
8
|
MARK_CHARS = "¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº⁽⁾†‡⁺•°▪◊#^˄*~".freeze
|
8
|
-
PREFIX_RE = /\A\s*(ingredients
|
9
|
+
PREFIX_RE = /\A\s*(ingredients(\s*list)?|contains|ingred[iï][eë]nt(en)?(declaratie)?|bevat|dit zit er\s?in|samenstelling|zutaten)\b\s*[:;.]?\s*/i.freeze
|
9
10
|
NOTE_RE = /\A\b(dit product kan\b|deze verpakking kan\b|kan sporen\b.*?\bbevatten\b|voor allergenen\b|allergenen\b|allergie[- ]informatie(\s*:|\b)|E\s*=|gemaakt in\b|geproduceerd in\b|bevat mogelijk\b|kijk voor meer\b|allergie-info|in de fabriek\b|in dit bedrijf\b|voor [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bis [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bgebruikt\b)/i.freeze
|
10
11
|
# Keep in sync with +abbrev+ in the +Common+ grammar, plus relevant ones from the +Amount+ grammar.
|
11
12
|
ABBREV_RE = Regexp.union(
|
@@ -19,12 +20,12 @@ module FoodIngredientParser::Loose
|
|
19
20
|
T\.\s*aestivum\b(\s+vitt\.)? |
|
20
21
|
nucifera\s+L\. |
|
21
22
|
type\s+"\d+" |
|
22
|
-
E
|
23
|
+
E(-|\s+)?\d{3}[a-z]?\s*(\([iv]+\)|\[[iv]+\]) |
|
23
24
|
www\.[-_\/:%.A-Za-z0-9]+
|
24
25
|
)/xi,
|
25
26
|
*%w[
|
26
|
-
a.o.p b.g.a b.o.b c.a c.i d.e d.m.v d.o.c d.o.p d.s e.a e.g e.u f.i.l f.o.s
|
27
|
-
i.d i.e i.g.m.e i.g.p i.m.v i.o i.v.m l.s.l n.a n.b n.o n.v.t o.a o.b.v p.d.o
|
27
|
+
a.o.p b.g.a b.o.b c.a c.i d.e d.m.v d.o.c d.o.p d.s e.a e.g e.u f.i.l f.o.s h.o.h
|
28
|
+
i.a i.d i.e i.g.m.e i.g.p i.m.v i.o i.v.m l.s.l n.a n.b n.o n.v.t o.a o.b.v p.d.o
|
28
29
|
p.g.i q.s s.l s.s t.o.v u.h.t v.g v.s w.a w.o w.v vit denat alc vol conc subsp
|
29
30
|
min max ca
|
30
31
|
].map {|s| /\A#{Regexp.escape(s)}\b\.?/i}
|
@@ -75,6 +76,11 @@ module FoodIngredientParser::Loose
|
|
75
76
|
elsif ")]".include?(c) # close nesting
|
76
77
|
add_child
|
77
78
|
close_parent
|
79
|
+
# after bracket check for 'and' to not lose text
|
80
|
+
if is_and_sep?(@i+1)
|
81
|
+
@i += and_sep_len(@i+1)
|
82
|
+
add_child
|
83
|
+
end
|
78
84
|
elsif is_notes_start? # usually a dot marks the start of notes
|
79
85
|
close_all_ancestors
|
80
86
|
@iterator = :notes
|
@@ -148,6 +154,15 @@ module FoodIngredientParser::Loose
|
|
148
154
|
chars.include?(c) && @s[@i-1..@i+1] !~ /\A\d.\d\z/
|
149
155
|
end
|
150
156
|
|
157
|
+
def is_and_sep?(i = @i)
|
158
|
+
and_sep_len(i) > 0
|
159
|
+
end
|
160
|
+
|
161
|
+
def and_sep_len(i = @i)
|
162
|
+
m = @s[i..-1].match(AND_SEP_RE)
|
163
|
+
m ? m.offset(0).last : 0
|
164
|
+
end
|
165
|
+
|
151
166
|
def is_mark?(i = @i)
|
152
167
|
mark_len(i) > 0 && @s[i..i+1] !~ /\A°[CF]/
|
153
168
|
end
|
@@ -7,8 +7,8 @@ module FoodIngredientParser::Loose
|
|
7
7
|
#
|
8
8
|
# @note mark and amount is lost, this is not expected on e-numbers
|
9
9
|
|
10
|
-
SPLIT_RE = /\s
|
11
|
-
SINGLE_RE = /E
|
10
|
+
SPLIT_RE = /\s*(-|\ben\b|\band\b|\bund\b|\bet\b)\s*/.freeze
|
11
|
+
SINGLE_RE = /E(-|\s+)?\d{3}[a-z]?\s*(\([iv]+\)|\[[iv]+\])?/i.freeze
|
12
12
|
MATCH_RE = /\A\s*(#{SINGLE_RE})(?:#{SPLIT_RE}(#{SINGLE_RE}))+\s*\z/i.freeze
|
13
13
|
|
14
14
|
def self.transform!(node)
|
@@ -58,8 +58,8 @@ module FoodIngredientParser::Strict::Grammar
|
|
58
58
|
end
|
59
59
|
|
60
60
|
rule e_number
|
61
|
-
( 'E'i '-'? [0-9] [0-9] [0-9] [[:alpha:]]? )
|
62
|
-
( ( ws* '(' [iIvV]+ ')' ) / ![[:alnum:]] ) # e.g. "E450 (iii)"
|
61
|
+
( 'E'i ( '-' / ws+ )? [0-9] [0-9] [0-9] [[:alpha:]]? )
|
62
|
+
( ( ws* '(' [iIvV]+ ')' ) / ( ws* '[' [iIvV]+ ']' ) / ![[:alnum:]] ) # e.g. "E450 (iii)"
|
63
63
|
end
|
64
64
|
|
65
65
|
rule chem_systematic_name
|
@@ -102,6 +102,7 @@ module FoodIngredientParser::Strict::Grammar
|
|
102
102
|
'e.u'i /
|
103
103
|
'f.i.l'i /
|
104
104
|
'f.o.s'i /
|
105
|
+
'h.o.h'i /
|
105
106
|
'i.a'i /
|
106
107
|
'i.d'i /
|
107
108
|
'i.e'i /
|
@@ -5,7 +5,12 @@ module FoodIngredientParser::Strict::Grammar
|
|
5
5
|
include IngredientColoned
|
6
6
|
|
7
7
|
rule ingredient
|
8
|
-
ws*
|
8
|
+
ws*
|
9
|
+
(
|
10
|
+
ingredient_nested ( ws* and ws+ ingredient )? /
|
11
|
+
ingredient_coloned /
|
12
|
+
ingredient_simple_with_amount
|
13
|
+
)
|
9
14
|
end
|
10
15
|
|
11
16
|
end
|
@@ -5,15 +5,15 @@ module FoodIngredientParser::Strict::Grammar
|
|
5
5
|
include Ingredient
|
6
6
|
|
7
7
|
rule list_coloned
|
8
|
-
contains:( ( ws* list_coloned_ingredient ws* '
|
9
|
-
contains:( ( ws* list_coloned_ingredient ws* '
|
10
|
-
contains:( ( ws* list_coloned_ingredient ws* '.'
|
11
|
-
contains:( ( ws* list_coloned_ingredient ws* '.'
|
12
|
-
contains:( ( ws* list_coloned_ingredient ws* '
|
13
|
-
contains:( ( ws* list_coloned_ingredient ws* '
|
14
|
-
contains:( ( ws* list_coloned_ingredient ws* ';'
|
15
|
-
contains:( ( ws* list_coloned_ingredient ws* ';'
|
16
|
-
contains:( ws* list_coloned_ingredient )
|
8
|
+
contains:( ( ws* list_coloned_ingredient ws* '.' ws* ',' )+ ws* list_coloned_ingredient ) <ListNode> /
|
9
|
+
contains:( ( ws* list_coloned_ingredient ws* '.' ws* ',' )+ ) <ListNode> /
|
10
|
+
contains:( ( ws* list_coloned_ingredient ws* '.' )+ ws* list_coloned_ingredient ) <ListNode> /
|
11
|
+
contains:( ( ws* list_coloned_ingredient ws* '.' )+ ) <ListNode> /
|
12
|
+
contains:( ( ws* list_coloned_ingredient ws* ';' ws* ',' )+ ws* list_coloned_ingredient ) <ListNode> /
|
13
|
+
contains:( ( ws* list_coloned_ingredient ws* ';' ws* ',' )+ ) <ListNode> /
|
14
|
+
contains:( ( ws* list_coloned_ingredient ws* ';' )+ ws* list_coloned_ingredient ) <ListNode> /
|
15
|
+
contains:( ( ws* list_coloned_ingredient ws* ';' )+ ) <ListNode> /
|
16
|
+
contains:( ws* list_coloned_ingredient ) <ListNode>
|
17
17
|
end
|
18
18
|
|
19
19
|
rule list_coloned_inner_list
|
@@ -22,7 +22,7 @@ module FoodIngredientParser::Strict::Grammar
|
|
22
22
|
end
|
23
23
|
|
24
24
|
rule list_coloned_ingredient
|
25
|
-
ing:ingredient_simple_with_amount ws* ':' ws* amount:amount post:( ws* '}' )?
|
25
|
+
ing:ingredient_simple_with_amount ws* ':' ws* amount:amount post:( ws* '}' )? !( ws* word ) <IngredientNode> /
|
26
26
|
ing:ingredient_simple_with_amount ws* ':' post:( ws* '}' )? ws* contains:list_coloned_inner_list <NestedIngredientNode>
|
27
27
|
end
|
28
28
|
|
@@ -19,9 +19,10 @@ module FoodIngredientParser::Strict::Grammar
|
|
19
19
|
|
20
20
|
rule root_prefix
|
21
21
|
(
|
22
|
-
'ingredients'i / 'contains'i /
|
22
|
+
'ingredients'i ( ws+ 'list'i )? / 'contains'i /
|
23
23
|
('ingred'i [IÏiï] [EËeë] 'n'i ( 't'i 'en'i? 'declaratie'i? )? ) / 'bevat'i / 'dit zit er in'i / 'samenstelling'i /
|
24
|
-
'zutaten'i
|
24
|
+
'zutaten'i /
|
25
|
+
'ingredienser'i
|
25
26
|
)
|
26
27
|
( ws* [:;.] ( ws* newline )? / ws* newline / ws ) ws* # optional colon or other separator
|
27
28
|
"'"? ws* # stray quote occurs sometimes
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: food_ingredient_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- wvengen
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-01-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: treetop
|
@@ -87,7 +87,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
87
87
|
- !ruby/object:Gem::Version
|
88
88
|
version: '0'
|
89
89
|
requirements: []
|
90
|
-
rubygems_version: 3.
|
90
|
+
rubygems_version: 3.1.6
|
91
91
|
signing_key:
|
92
92
|
specification_version: 4
|
93
93
|
summary: Parser for ingredient lists found on food products.
|