food_ingredient_parser 1.1.9 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +5 -1
- data/bin/food_ingredient_parser +20 -10
- data/lib/food_ingredient_parser/loose/scanner.rb +19 -4
- data/lib/food_ingredient_parser/loose/transform/split_e_numbers.rb +2 -2
- data/lib/food_ingredient_parser/strict/grammar/common.treetop +3 -2
- data/lib/food_ingredient_parser/strict/grammar/ingredient.treetop +6 -1
- data/lib/food_ingredient_parser/strict/grammar/list_coloned.treetop +10 -10
- data/lib/food_ingredient_parser/strict/grammar/root.treetop +3 -2
- data/lib/food_ingredient_parser/version.rb +2 -2
- metadata +3 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 7c478a080e36c8f48ee3dbd6e9978eadec3758a4b0ab6fab571e18f103ed6bf0
|
|
4
|
+
data.tar.gz: aa078366f72ab03d038d497c908a3ad92f5816f37d3f0308fa64e81680905dea
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: d8acbd71e431958a72350e6fd1d3e5e8d21db8ee53525c53a08bbe2c564734fca9601ac0fdc33d9737695f292bc7cd6da898721f02f68ca8f87175c5b276c709
|
|
7
|
+
data.tar.gz: f261a1537a6e903d55b36dc91c0a1c302893d7a092a83afb8598730c87142d041fdaf86540918697f297f5024461082e5e56594df64f603f56f4b22148c7c9fd
|
data/README.md
CHANGED
|
@@ -69,6 +69,7 @@ Usage: bin/food_ingredient_parser [options] --file|-f <filename>
|
|
|
69
69
|
-r, --parser PARSER Use specific parser (strict, loose).
|
|
70
70
|
-e, --[no-]escape Escape newlines
|
|
71
71
|
-c, --[no-]color Use color
|
|
72
|
+
--[no-]html Print as HTML with parsing markup
|
|
72
73
|
-v, --[no-]verbose Show more data (parsed tree).
|
|
73
74
|
--version Show program version.
|
|
74
75
|
-h, --help Show this help
|
|
@@ -103,6 +104,9 @@ RootNode+Root3 offset=0, "tomato" (contains,notes):
|
|
|
103
104
|
SyntaxNode offset=6, ""
|
|
104
105
|
{:contains=>[{:name=>"tomato"}]}
|
|
105
106
|
|
|
107
|
+
$ food_ingredient_parser --html -s "tomato"
|
|
108
|
+
<div class="root"><span class='depth0'><span class='name'>tomato</span></span></div>
|
|
109
|
+
|
|
106
110
|
$ food_ingredient_parser -v -r loose -s "tomato"
|
|
107
111
|
"tomato"
|
|
108
112
|
Node interval=0..5
|
|
@@ -193,7 +197,7 @@ plus a bit of English and German. Support for other languages is already good, b
|
|
|
193
197
|
areas: improvements are welcome (starting with a corpus in [data/](data/)).
|
|
194
198
|
|
|
195
199
|
Many ingredient lists from the USA are structured a bit differently than those from Europe, they
|
|
196
|
-
parse less well (that
|
|
200
|
+
parse less well (that is probably a matter of fine-tuning).
|
|
197
201
|
|
|
198
202
|
## Test data
|
|
199
203
|
|
data/bin/food_ingredient_parser
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
#
|
|
3
3
|
# Parser for food ingredient lists.
|
|
4
4
|
#
|
|
5
|
+
require 'cgi'
|
|
5
6
|
require 'optparse'
|
|
6
7
|
|
|
7
8
|
$:.push(File.expand_path(File.dirname(__FILE__) + "/../lib"))
|
|
@@ -31,24 +32,31 @@ def colorize(color, s)
|
|
|
31
32
|
end
|
|
32
33
|
end
|
|
33
34
|
|
|
34
|
-
def parse_single(s, parsed=nil, parser:, verbosity: 1, print: nil, escape: false, color: false)
|
|
35
|
+
def parse_single(s, parsed=nil, parser:, verbosity: 1, print: nil, escape: false, color: false, html: false)
|
|
35
36
|
parsed ||= parser.parse(s)
|
|
36
37
|
|
|
37
38
|
return unless print.nil? || (parsed && print == :parsed) || (!parsed && print == :noresult)
|
|
38
39
|
|
|
39
|
-
puts colorize(color && "0;32", escape ? s.gsub("\n", "\\n") : s) if verbosity > 0
|
|
40
|
+
puts colorize(color && "0;32", escape ? s.gsub("\n", "\\n") : s) if !html && verbosity > 0
|
|
40
41
|
|
|
41
|
-
if parsed
|
|
42
|
+
if !html && parsed
|
|
42
43
|
puts(parsed.inspect) if verbosity > 1
|
|
43
44
|
pp(parsed.to_h, color: color) if verbosity > 0
|
|
44
|
-
|
|
45
|
-
|
|
45
|
+
|
|
46
|
+
elsif !html && !parsed
|
|
46
47
|
puts "(no result: #{parser.parser.failure_reason})" if verbosity > 0
|
|
47
|
-
|
|
48
|
+
|
|
49
|
+
elsif html && parsed
|
|
50
|
+
puts('<div class="root">' + parsed.to_html + '</div>') if verbosity > 0
|
|
51
|
+
|
|
52
|
+
else
|
|
53
|
+
puts('<div class="root">' + CGI.escapeHTML(parsed) + '</div>') if verbosity > 0
|
|
48
54
|
end
|
|
55
|
+
|
|
56
|
+
return !!parsed
|
|
49
57
|
end
|
|
50
58
|
|
|
51
|
-
def parse_file(path, parser:, verbosity: 1, print: nil, escape: false, color: false)
|
|
59
|
+
def parse_file(path, parser:, verbosity: 1, print: nil, escape: false, color: false, html: false)
|
|
52
60
|
count_parsed = count_noresult = 0
|
|
53
61
|
File.foreach(path) do |line|
|
|
54
62
|
next if line =~ /^#/ # comment
|
|
@@ -59,7 +67,7 @@ def parse_file(path, parser:, verbosity: 1, print: nil, escape: false, color: fa
|
|
|
59
67
|
count_parsed += 1 if parsed
|
|
60
68
|
count_noresult += 1 unless parsed
|
|
61
69
|
|
|
62
|
-
parse_single(line, parsed, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color)
|
|
70
|
+
parse_single(line, parsed, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color, html: html)
|
|
63
71
|
end
|
|
64
72
|
|
|
65
73
|
pct_parsed = 100.0 * count_parsed / (count_parsed + count_noresult)
|
|
@@ -75,6 +83,7 @@ print = nil
|
|
|
75
83
|
parser_name = :strict
|
|
76
84
|
escape = false
|
|
77
85
|
color = true
|
|
86
|
+
html = false
|
|
78
87
|
PARSERS = {
|
|
79
88
|
strict: FoodIngredientParser::Strict::Parser,
|
|
80
89
|
loose: FoodIngredientParser::Loose::Parser
|
|
@@ -95,6 +104,7 @@ OptionParser.new do |opts|
|
|
|
95
104
|
opts.on("-r", "--parser PARSER", "Use specific parser (#{PARSERS.keys.join(", ")}).") {|p| parser_name = p&.downcase&.to_sym }
|
|
96
105
|
opts.on("-e", "--[no-]escape", "Escape newlines") {|e| escape = !!e }
|
|
97
106
|
opts.on("-c", "--[no-]color", "Use color") {|e| color = !!e }
|
|
107
|
+
opts.on( "--[no-]html", "Print as HTML with parsing markup") {|e| html = !!e }
|
|
98
108
|
opts.on("-v", "--[no-]verbose", "Show more data (parsed tree).") {|v| verbosity = v ? 2 : 1 }
|
|
99
109
|
opts.on( "--version", "Show program version.") do
|
|
100
110
|
puts("food_ingredient_parser v#{FoodIngredientParser::VERSION}")
|
|
@@ -112,8 +122,8 @@ if strings.any? || files.any?
|
|
|
112
122
|
exit(1)
|
|
113
123
|
end
|
|
114
124
|
success = true
|
|
115
|
-
strings.each {|s| success &= parse_single(s, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color) }
|
|
116
|
-
files.each {|f| success &= parse_file(f, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color) == 0 }
|
|
125
|
+
strings.each {|s| success &= parse_single(s, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color, html: html) }
|
|
126
|
+
files.each {|f| success &= parse_file(f, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color, html: html) == 0 }
|
|
117
127
|
success or exit(1)
|
|
118
128
|
else
|
|
119
129
|
STDERR.puts("Please specify one or more --file or --string arguments (see --help).")
|
|
@@ -4,8 +4,9 @@ module FoodIngredientParser::Loose
|
|
|
4
4
|
class Scanner
|
|
5
5
|
|
|
6
6
|
SEP_CHARS = "|;,.".freeze
|
|
7
|
+
AND_SEP_RE = /\A\s*(and|en|und)\s+/i.freeze
|
|
7
8
|
MARK_CHARS = "¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº⁽⁾†‡⁺•°▪◊#^˄*~".freeze
|
|
8
|
-
PREFIX_RE = /\A\s*(ingredients
|
|
9
|
+
PREFIX_RE = /\A\s*(ingredients(\s*list)?|contains|ingred[iï][eë]nt(en)?(declaratie)?|bevat|dit zit er\s?in|samenstelling|zutaten)\b\s*[:;.]?\s*/i.freeze
|
|
9
10
|
NOTE_RE = /\A\b(dit product kan\b|deze verpakking kan\b|kan sporen\b.*?\bbevatten\b|voor allergenen\b|allergenen\b|allergie[- ]informatie(\s*:|\b)|E\s*=|gemaakt in\b|geproduceerd in\b|bevat mogelijk\b|kijk voor meer\b|allergie-info|in de fabriek\b|in dit bedrijf\b|voor [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bis [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bgebruikt\b)/i.freeze
|
|
10
11
|
# Keep in sync with +abbrev+ in the +Common+ grammar, plus relevant ones from the +Amount+ grammar.
|
|
11
12
|
ABBREV_RE = Regexp.union(
|
|
@@ -19,12 +20,12 @@ module FoodIngredientParser::Loose
|
|
|
19
20
|
T\.\s*aestivum\b(\s+vitt\.)? |
|
|
20
21
|
nucifera\s+L\. |
|
|
21
22
|
type\s+"\d+" |
|
|
22
|
-
E
|
|
23
|
+
E(-|\s+)?\d{3}[a-z]?\s*(\([iv]+\)|\[[iv]+\]) |
|
|
23
24
|
www\.[-_\/:%.A-Za-z0-9]+
|
|
24
25
|
)/xi,
|
|
25
26
|
*%w[
|
|
26
|
-
a.o.p b.g.a b.o.b c.a c.i d.e d.m.v d.o.c d.o.p d.s e.a e.g e.u f.i.l f.o.s
|
|
27
|
-
i.d i.e i.g.m.e i.g.p i.m.v i.o i.v.m l.s.l n.a n.b n.o n.v.t o.a o.b.v p.d.o
|
|
27
|
+
a.o.p b.g.a b.o.b c.a c.i d.e d.m.v d.o.c d.o.p d.s e.a e.g e.u f.i.l f.o.s h.o.h
|
|
28
|
+
i.a i.d i.e i.g.m.e i.g.p i.m.v i.o i.v.m l.s.l n.a n.b n.o n.v.t o.a o.b.v p.d.o
|
|
28
29
|
p.g.i q.s s.l s.s t.o.v u.h.t v.g v.s w.a w.o w.v vit denat alc vol conc subsp
|
|
29
30
|
min max ca
|
|
30
31
|
].map {|s| /\A#{Regexp.escape(s)}\b\.?/i}
|
|
@@ -75,6 +76,11 @@ module FoodIngredientParser::Loose
|
|
|
75
76
|
elsif ")]".include?(c) # close nesting
|
|
76
77
|
add_child
|
|
77
78
|
close_parent
|
|
79
|
+
# after bracket check for 'and' to not lose text
|
|
80
|
+
if is_and_sep?(@i+1)
|
|
81
|
+
@i += and_sep_len(@i+1)
|
|
82
|
+
add_child
|
|
83
|
+
end
|
|
78
84
|
elsif is_notes_start? # usually a dot marks the start of notes
|
|
79
85
|
close_all_ancestors
|
|
80
86
|
@iterator = :notes
|
|
@@ -148,6 +154,15 @@ module FoodIngredientParser::Loose
|
|
|
148
154
|
chars.include?(c) && @s[@i-1..@i+1] !~ /\A\d.\d\z/
|
|
149
155
|
end
|
|
150
156
|
|
|
157
|
+
def is_and_sep?(i = @i)
|
|
158
|
+
and_sep_len(i) > 0
|
|
159
|
+
end
|
|
160
|
+
|
|
161
|
+
def and_sep_len(i = @i)
|
|
162
|
+
m = @s[i..-1].match(AND_SEP_RE)
|
|
163
|
+
m ? m.offset(0).last : 0
|
|
164
|
+
end
|
|
165
|
+
|
|
151
166
|
def is_mark?(i = @i)
|
|
152
167
|
mark_len(i) > 0 && @s[i..i+1] !~ /\A°[CF]/
|
|
153
168
|
end
|
|
@@ -7,8 +7,8 @@ module FoodIngredientParser::Loose
|
|
|
7
7
|
#
|
|
8
8
|
# @note mark and amount is lost, this is not expected on e-numbers
|
|
9
9
|
|
|
10
|
-
SPLIT_RE = /\s
|
|
11
|
-
SINGLE_RE = /E
|
|
10
|
+
SPLIT_RE = /\s*(-|\ben\b|\band\b|\bund\b|\bet\b)\s*/.freeze
|
|
11
|
+
SINGLE_RE = /E(-|\s+)?\d{3}[a-z]?\s*(\([iv]+\)|\[[iv]+\])?/i.freeze
|
|
12
12
|
MATCH_RE = /\A\s*(#{SINGLE_RE})(?:#{SPLIT_RE}(#{SINGLE_RE}))+\s*\z/i.freeze
|
|
13
13
|
|
|
14
14
|
def self.transform!(node)
|
|
@@ -58,8 +58,8 @@ module FoodIngredientParser::Strict::Grammar
|
|
|
58
58
|
end
|
|
59
59
|
|
|
60
60
|
rule e_number
|
|
61
|
-
( 'E'i '-'? [0-9] [0-9] [0-9] [[:alpha:]]? )
|
|
62
|
-
( ( ws* '(' [iIvV]+ ')' ) / ![[:alnum:]] ) # e.g. "E450 (iii)"
|
|
61
|
+
( 'E'i ( '-' / ws+ )? [0-9] [0-9] [0-9] [[:alpha:]]? )
|
|
62
|
+
( ( ws* '(' [iIvV]+ ')' ) / ( ws* '[' [iIvV]+ ']' ) / ![[:alnum:]] ) # e.g. "E450 (iii)"
|
|
63
63
|
end
|
|
64
64
|
|
|
65
65
|
rule chem_systematic_name
|
|
@@ -102,6 +102,7 @@ module FoodIngredientParser::Strict::Grammar
|
|
|
102
102
|
'e.u'i /
|
|
103
103
|
'f.i.l'i /
|
|
104
104
|
'f.o.s'i /
|
|
105
|
+
'h.o.h'i /
|
|
105
106
|
'i.a'i /
|
|
106
107
|
'i.d'i /
|
|
107
108
|
'i.e'i /
|
|
@@ -5,7 +5,12 @@ module FoodIngredientParser::Strict::Grammar
|
|
|
5
5
|
include IngredientColoned
|
|
6
6
|
|
|
7
7
|
rule ingredient
|
|
8
|
-
ws*
|
|
8
|
+
ws*
|
|
9
|
+
(
|
|
10
|
+
ingredient_nested ( ws* and ws+ ingredient )? /
|
|
11
|
+
ingredient_coloned /
|
|
12
|
+
ingredient_simple_with_amount
|
|
13
|
+
)
|
|
9
14
|
end
|
|
10
15
|
|
|
11
16
|
end
|
|
@@ -5,15 +5,15 @@ module FoodIngredientParser::Strict::Grammar
|
|
|
5
5
|
include Ingredient
|
|
6
6
|
|
|
7
7
|
rule list_coloned
|
|
8
|
-
contains:( ( ws* list_coloned_ingredient ws* '
|
|
9
|
-
contains:( ( ws* list_coloned_ingredient ws* '
|
|
10
|
-
contains:( ( ws* list_coloned_ingredient ws* '.'
|
|
11
|
-
contains:( ( ws* list_coloned_ingredient ws* '.'
|
|
12
|
-
contains:( ( ws* list_coloned_ingredient ws* '
|
|
13
|
-
contains:( ( ws* list_coloned_ingredient ws* '
|
|
14
|
-
contains:( ( ws* list_coloned_ingredient ws* ';'
|
|
15
|
-
contains:( ( ws* list_coloned_ingredient ws* ';'
|
|
16
|
-
contains:( ws* list_coloned_ingredient )
|
|
8
|
+
contains:( ( ws* list_coloned_ingredient ws* '.' ws* ',' )+ ws* list_coloned_ingredient ) <ListNode> /
|
|
9
|
+
contains:( ( ws* list_coloned_ingredient ws* '.' ws* ',' )+ ) <ListNode> /
|
|
10
|
+
contains:( ( ws* list_coloned_ingredient ws* '.' )+ ws* list_coloned_ingredient ) <ListNode> /
|
|
11
|
+
contains:( ( ws* list_coloned_ingredient ws* '.' )+ ) <ListNode> /
|
|
12
|
+
contains:( ( ws* list_coloned_ingredient ws* ';' ws* ',' )+ ws* list_coloned_ingredient ) <ListNode> /
|
|
13
|
+
contains:( ( ws* list_coloned_ingredient ws* ';' ws* ',' )+ ) <ListNode> /
|
|
14
|
+
contains:( ( ws* list_coloned_ingredient ws* ';' )+ ws* list_coloned_ingredient ) <ListNode> /
|
|
15
|
+
contains:( ( ws* list_coloned_ingredient ws* ';' )+ ) <ListNode> /
|
|
16
|
+
contains:( ws* list_coloned_ingredient ) <ListNode>
|
|
17
17
|
end
|
|
18
18
|
|
|
19
19
|
rule list_coloned_inner_list
|
|
@@ -22,7 +22,7 @@ module FoodIngredientParser::Strict::Grammar
|
|
|
22
22
|
end
|
|
23
23
|
|
|
24
24
|
rule list_coloned_ingredient
|
|
25
|
-
ing:ingredient_simple_with_amount ws* ':' ws* amount:amount post:( ws* '}' )?
|
|
25
|
+
ing:ingredient_simple_with_amount ws* ':' ws* amount:amount post:( ws* '}' )? !( ws* word ) <IngredientNode> /
|
|
26
26
|
ing:ingredient_simple_with_amount ws* ':' post:( ws* '}' )? ws* contains:list_coloned_inner_list <NestedIngredientNode>
|
|
27
27
|
end
|
|
28
28
|
|
|
@@ -19,9 +19,10 @@ module FoodIngredientParser::Strict::Grammar
|
|
|
19
19
|
|
|
20
20
|
rule root_prefix
|
|
21
21
|
(
|
|
22
|
-
'ingredients'i / 'contains'i /
|
|
22
|
+
'ingredients'i ( ws+ 'list'i )? / 'contains'i /
|
|
23
23
|
('ingred'i [IÏiï] [EËeë] 'n'i ( 't'i 'en'i? 'declaratie'i? )? ) / 'bevat'i / 'dit zit er in'i / 'samenstelling'i /
|
|
24
|
-
'zutaten'i
|
|
24
|
+
'zutaten'i /
|
|
25
|
+
'ingredienser'i
|
|
25
26
|
)
|
|
26
27
|
( ws* [:;.] ( ws* newline )? / ws* newline / ws ) ws* # optional colon or other separator
|
|
27
28
|
"'"? ws* # stray quote occurs sometimes
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: food_ingredient_parser
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.1.9
|
|
4
|
+
version: 1.2.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- wvengen
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2024-01-19 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: treetop
|
|
@@ -87,7 +87,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
87
87
|
- !ruby/object:Gem::Version
|
|
88
88
|
version: '0'
|
|
89
89
|
requirements: []
|
|
90
|
-
rubygems_version: 3.
|
|
90
|
+
rubygems_version: 3.1.6
|
|
91
91
|
signing_key:
|
|
92
92
|
specification_version: 4
|
|
93
93
|
summary: Parser for ingredient lists found on food products.
|