food_ingredient_parser 1.0.0.pre.8 → 1.0.0.pre.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +4 -3
- data/lib/food_ingredient_parser/loose/node.rb +1 -1
- data/lib/food_ingredient_parser/loose/parser.rb +4 -1
- data/lib/food_ingredient_parser/loose/scanner.rb +31 -9
- data/lib/food_ingredient_parser/loose/transform/handle_missing_name.rb +64 -0
- data/lib/food_ingredient_parser/strict/grammar/amount.treetop +16 -10
- data/lib/food_ingredient_parser/strict/grammar/common.treetop +11 -0
- data/lib/food_ingredient_parser/strict/grammar/ingredient_nested.treetop +8 -8
- data/lib/food_ingredient_parser/strict/grammar/list_coloned.treetop +5 -3
- data/lib/food_ingredient_parser/strict/nodes.rb +1 -1
- data/lib/food_ingredient_parser/version.rb +1 -1
- metadata +2 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f08e6fcc9422b83503d37b41111f3bd540c11909
|
4
|
+
data.tar.gz: 16af14258ae67fa9b03b2b2196da9631d3dc6a9d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b1bfd6c713f0117cc8c13f4110624d270cb471096577e2dc47ea43f6f70664c88c9a2bec444bac407f9424b3c147245e16d04b5b0e9bf8442273c8d90c27955c
|
7
|
+
data.tar.gz: 873de15303ea9bebb4ab9d3504851a4b364e1aef849f6c530235476a4d8e65867fe5426dd7b9274ecd99befb2de83746b8854a6ba08a24cb8caac7525973340b
|
data/README.md
CHANGED
@@ -29,7 +29,7 @@ Results in
|
|
29
29
|
```ruby
|
30
30
|
{
|
31
31
|
:contains=>[
|
32
|
-
{:name=>"Water", :amount=>"60%", :
|
32
|
+
{:name=>"Water", :amount=>"60%", :marks=>["*"]},
|
33
33
|
{:name=>"suiker", :amount=>"30%"},
|
34
34
|
{:name=>"voedingszuren", :contains=>[
|
35
35
|
{:name=>"citroenzuur"}
|
@@ -165,8 +165,9 @@ Even though the strict parser would not give a result, the loose parser returns:
|
|
165
165
|
{
|
166
166
|
:contains=>[
|
167
167
|
{:name=>"Saus", :contains=>[
|
168
|
-
{:name=>"tomaat", :
|
169
|
-
|
168
|
+
{:name=>"tomaat", :marks=>["*"], :amount=>"10%",
|
169
|
+
:contains=>[{:name=>"zout"}
|
170
|
+
]},
|
170
171
|
{:name=>"peper"}
|
171
172
|
]}
|
172
173
|
]
|
@@ -28,7 +28,7 @@ module FoodIngredientParser::Loose
|
|
28
28
|
def to_h
|
29
29
|
r = {}
|
30
30
|
r[:name] = name.text_value.strip if name && name.text_value.strip != ''
|
31
|
-
r[:
|
31
|
+
r[:marks] = [mark.text_value.strip] if mark
|
32
32
|
r[:amount] = amount.text_value.strip if amount
|
33
33
|
r[:contains] = contains.map(&:to_h).reject {|c| c == {} } if contains.any?
|
34
34
|
r[:notes] = notes.map{|n| n.text_value.strip }.reject {|c| c == '' } if notes.any?
|
@@ -1,6 +1,7 @@
|
|
1
1
|
require_relative '../cleaner'
|
2
2
|
require_relative 'scanner'
|
3
3
|
require_relative 'transform/amount'
|
4
|
+
require_relative 'transform/handle_missing_name'
|
4
5
|
|
5
6
|
module FoodIngredientParser::Loose
|
6
7
|
class Parser
|
@@ -13,11 +14,13 @@ module FoodIngredientParser::Loose
|
|
13
14
|
# Parse food ingredient list text into a structured representation.
|
14
15
|
#
|
15
16
|
# @option clean [Boolean] pass +false+ to disable correcting frequently occurring issues
|
17
|
+
# @option normalize [Boolean] pass +false+ to disable some normalizations (handling missing names)
|
16
18
|
# @return [FoodIngredientParser::Loose::Node] structured representation of food ingredients
|
17
|
-
def parse(s, clean: true, **options)
|
19
|
+
def parse(s, clean: true, normalize: true, **options)
|
18
20
|
s = FoodIngredientParser::Cleaner.clean(s) if clean
|
19
21
|
n = Scanner.new(s).scan
|
20
22
|
n = Transform::Amount.transform!(n) if n
|
23
|
+
n = Transform::HandleMissingName.transform!(n) if n && normalize
|
21
24
|
n
|
22
25
|
end
|
23
26
|
end
|
@@ -5,8 +5,15 @@ module FoodIngredientParser::Loose
|
|
5
5
|
|
6
6
|
SEP_CHARS = "|;,.".freeze
|
7
7
|
MARK_CHARS = "¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº⁽⁾†‡•°#^*".freeze
|
8
|
-
PREFIX_RE = /\A\s*(ingredients|contains|ingred[iï][eë]nt(en)?(declaratie)?|bevat|dit zit er\s?in|samenstelling|zutaten)\s*[:;.]
|
8
|
+
PREFIX_RE = /\A\s*(ingredients|contains|ingred[iï][eë]nt(en)?(declaratie)?|bevat|dit zit er\s?in|samenstelling|zutaten)\b\s*[:;.]?\s*/i.freeze
|
9
9
|
NOTE_RE = /\A\b(dit product kan\b|kan sporen\b.*?\bbevatten\b|voor allergenen\b|allergenen\b|E\s*=|gemaakt in\b|geproduceerd in\b|bevat mogelijk\b|kijk voor meer\b|allergie-info|in de fabriek\b|in dit bedrijf\b)/i.freeze
|
10
|
+
# Keep in sync with +abbrev+ in the +Common+ grammar, plus relevant ones from the +Amount+ grammar.
|
11
|
+
ABBREV_RE = Regexp.union(%w[
|
12
|
+
a.o.p b.g.a b.o.b c.a c.i d.e d.m.v d.o.c d.o.p d.s e.a e.g e.u f.i.l f.o.s i.a
|
13
|
+
i.d i.e i.g.m.e i.g.p i.m.v i.o i.v.m l.s.l n.a n.b n.o n.v.t o.a o.b.v p.d.o
|
14
|
+
p.g.i q.s s.l s.s t.o.v u.h.t v.g v.s w.a w.o w.v vit denat N° °C
|
15
|
+
min max ca
|
16
|
+
].map {|s| /\A#{Regexp.escape(s)}\b\.?/})
|
10
17
|
|
11
18
|
def initialize(s, index: 0)
|
12
19
|
@s = s # input string
|
@@ -45,7 +52,10 @@ module FoodIngredientParser::Loose
|
|
45
52
|
end
|
46
53
|
|
47
54
|
def scan_iteration_standard
|
48
|
-
if
|
55
|
+
if (len = abbrev_len) > 0 # defer iterations until after any abbreviation
|
56
|
+
cur # reference to record starting position
|
57
|
+
@i += len - 1
|
58
|
+
elsif "([".include?(c) # open nesting
|
49
59
|
open_parent
|
50
60
|
elsif ")]".include?(c) # close nesting
|
51
61
|
add_child
|
@@ -57,8 +67,12 @@ module FoodIngredientParser::Loose
|
|
57
67
|
elsif is_sep? # separator
|
58
68
|
add_child
|
59
69
|
elsif ":".include?(c) # another open nesting
|
60
|
-
|
61
|
-
|
70
|
+
if @s[@i+1..-1] =~ /\A\s*(\(|\[)/
|
71
|
+
# ignore if before an open bracket, then it's a regular nesting
|
72
|
+
else
|
73
|
+
open_parent(auto_close: true)
|
74
|
+
@iterator = :colon
|
75
|
+
end
|
62
76
|
elsif is_mark? && !cur.mark # mark after ingredient
|
63
77
|
name_until_here
|
64
78
|
len = mark_len
|
@@ -70,7 +84,10 @@ module FoodIngredientParser::Loose
|
|
70
84
|
end
|
71
85
|
|
72
86
|
def scan_iteration_colon
|
73
|
-
if
|
87
|
+
if (len = abbrev_len) > 0 # defer iterations until after any abbreviation
|
88
|
+
cur # reference to record starting position
|
89
|
+
@i += len - 1
|
90
|
+
elsif "/".include?(c) # slash separator in colon nesting only
|
74
91
|
add_child
|
75
92
|
elsif is_sep? # regular separator indicates end of colon nesting
|
76
93
|
add_child
|
@@ -108,14 +125,14 @@ module FoodIngredientParser::Loose
|
|
108
125
|
@cur ||= Node.new(@s, @i)
|
109
126
|
end
|
110
127
|
|
111
|
-
def is_mark?
|
112
|
-
mark_len > 0 && @s[@i..@i+1] !~ /\A°[CF]/
|
113
|
-
end
|
114
|
-
|
115
128
|
def is_sep?(chars: SEP_CHARS)
|
116
129
|
chars.include?(c) && @s[@i-1..@i+1] !~ /\A\d.\d\z/
|
117
130
|
end
|
118
131
|
|
132
|
+
def is_mark?
|
133
|
+
mark_len > 0 && @s[@i..@i+1] !~ /\A°[CF]/
|
134
|
+
end
|
135
|
+
|
119
136
|
def mark_len
|
120
137
|
i = @i
|
121
138
|
while @s[i] && MARK_CHARS.include?(@s[i])
|
@@ -124,6 +141,11 @@ module FoodIngredientParser::Loose
|
|
124
141
|
i - @i
|
125
142
|
end
|
126
143
|
|
144
|
+
def abbrev_len
|
145
|
+
m = @s[@i .. -1].match(ABBREV_RE)
|
146
|
+
m ? m.offset(0).last : 0
|
147
|
+
end
|
148
|
+
|
127
149
|
def is_notes_start?
|
128
150
|
# @todo use more heuristics: don't assume dot is notes when separator is a dot, and only toplevel?
|
129
151
|
if ( is_mark? && @s[@i+mark_len..-1] =~ /\A\s*=/ ) || # "* = Biologisch"
|
@@ -0,0 +1,64 @@
|
|
1
|
+
module FoodIngredientParser::Loose
|
2
|
+
module Transform
|
3
|
+
# Transforms node tree to handle missing names.
|
4
|
+
#
|
5
|
+
# The loose parser can return a node tree that has some ingredients without a name.
|
6
|
+
# Usually this means that either the parser wasn't smart enough to understand the input,
|
7
|
+
# or the input was not strictly clear (e.g. a case like "herbs, (oregano), salt" is often seen).
|
8
|
+
#
|
9
|
+
# When a contained node is found which doesn't have a name:
|
10
|
+
# * For the amount (if any): ignore it (as it's often ambiguous which ingredient it belongs to)
|
11
|
+
# * For the marks (if any): ignore it (we might instead add it to the containing ingredients)
|
12
|
+
# * For the containing ingredients (if any):
|
13
|
+
# - if the previous ingredient is present and doesn't contain ingredients already,
|
14
|
+
# assume the current contained ingredients are actually part of the previous ingredient.
|
15
|
+
# - if there is no previous ingredient, assume the nesting is wrong and insert them before
|
16
|
+
# the other ingredients one depth level above.
|
17
|
+
# - if there is a previous ingredient which contains ingredients, we can't make much of it,
|
18
|
+
# to avoid losing them, add them as contained ingredients to the previous ingredient.
|
19
|
+
#
|
20
|
+
class HandleMissingName
|
21
|
+
def self.transform!(node)
|
22
|
+
new(node).transform!
|
23
|
+
end
|
24
|
+
|
25
|
+
def initialize(node)
|
26
|
+
@node = node
|
27
|
+
end
|
28
|
+
|
29
|
+
def transform!
|
30
|
+
transform_children!(@node)
|
31
|
+
@node
|
32
|
+
end
|
33
|
+
|
34
|
+
private
|
35
|
+
|
36
|
+
def transform_children!(node)
|
37
|
+
prev = nil
|
38
|
+
new_contains = []
|
39
|
+
node.contains.each do |child|
|
40
|
+
# Apply recursively. Do it before processing to handle multiple depth levels of missing names.
|
41
|
+
transform_children!(child) if child.contains.any?
|
42
|
+
|
43
|
+
if child.name.nil? || child.name.text_value.strip == ''
|
44
|
+
# Name is empty, we need to do something.
|
45
|
+
if prev
|
46
|
+
# there is a previous ingredient: move children to new parent
|
47
|
+
prev.contains.push(*child.contains)
|
48
|
+
else
|
49
|
+
# there is no previous ingredient: move children one level up
|
50
|
+
new_contains.push(*child.contains)
|
51
|
+
end
|
52
|
+
else
|
53
|
+
# Nothing to see here, just leave it as it is.
|
54
|
+
new_contains << child
|
55
|
+
end
|
56
|
+
|
57
|
+
prev = child
|
58
|
+
end
|
59
|
+
|
60
|
+
node.contains = new_contains
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|
@@ -3,29 +3,35 @@ module FoodIngredientParser::Strict::Grammar
|
|
3
3
|
include Common
|
4
4
|
|
5
5
|
rule amount
|
6
|
-
'(' ws* amount:
|
7
|
-
'[' ws* amount:
|
8
|
-
'{' ws* amount:
|
9
|
-
amount:
|
6
|
+
'(' ws* amount:amount_simple ws* ')' <AmountNode> /
|
7
|
+
'[' ws* amount:amount_simple ws* ']' <AmountNode> /
|
8
|
+
'{' ws* amount:amount_simple ws* '}' <AmountNode> /
|
9
|
+
amount:amount_simple <AmountNode>
|
10
10
|
end
|
11
11
|
|
12
|
-
rule
|
12
|
+
rule amount_simple
|
13
13
|
( (
|
14
14
|
'of which'i / 'at least'i / 'minimal'i / 'maximal'i / 'less than'i / 'more than'i /
|
15
15
|
'waarvan'i / 'ten minste'i / 'tenminste'i / 'minimaal'i / 'maximaal'i / 'minder dan'i / 'meer dan'i /
|
16
|
-
'min.'i / 'min'i / 'max.'i / 'max'i
|
16
|
+
'min.'i / 'min'i / 'max.'i / 'max'i / 'c.a.'i / 'ca.'i / 'ca'i
|
17
17
|
) ws* )?
|
18
|
-
|
19
|
-
simple_amount_quantity
|
18
|
+
amount_simple_quantity
|
20
19
|
( ws+ (
|
21
20
|
'minimaal'i / 'minimum'i / 'van het uitlekgewicht'i / 'van het geheel'i /
|
22
21
|
'min.'i / 'min'i / 'max.'i / 'max'i
|
23
22
|
) )?
|
24
23
|
end
|
25
24
|
|
26
|
-
rule
|
27
|
-
|
25
|
+
rule amount_simple_quantity
|
26
|
+
amount_simple_number ( ws* amount_simple_unit? ws* dash ws* amount_simple_number )? ws* amount_simple_unit
|
27
|
+
end
|
28
|
+
|
29
|
+
rule amount_simple_number
|
30
|
+
( [±∓~∼∽≂≃≈≲≤<>≥≳] ws* )? number
|
28
31
|
end
|
29
32
|
|
33
|
+
rule amount_simple_unit
|
34
|
+
( percent / ( ( 'procent' / 'percent' / 'gram'i / 'ml'i / 'mg'i / 'gr'i / 'g'i ) !char ) )
|
35
|
+
end
|
30
36
|
end
|
31
37
|
end
|
@@ -31,10 +31,18 @@ module FoodIngredientParser::Strict::Grammar
|
|
31
31
|
[½⅓⅔¼¾⅕⅖⅗⅘⅙⅚⅐⅛⅜⅝⅞⅑⅒]
|
32
32
|
end
|
33
33
|
|
34
|
+
rule percent
|
35
|
+
[%٪⁒%﹪]
|
36
|
+
end
|
37
|
+
|
34
38
|
rule number
|
35
39
|
digit+ [,.] digit+ / digit+ ws* fraction / fraction / digit+
|
36
40
|
end
|
37
41
|
|
42
|
+
rule dash
|
43
|
+
[-֊ ‐ ‑ ‒ – — ― ﹘﹣-]
|
44
|
+
end
|
45
|
+
|
38
46
|
rule word
|
39
47
|
abbrev / char+
|
40
48
|
end
|
@@ -50,6 +58,9 @@ module FoodIngredientParser::Strict::Grammar
|
|
50
58
|
# cat data/ingredient-samples-nl | perl -ne '$_=lc($_); /\b(([a-z]\.)+[a-z])\W/ && print "$1\n"' | sort | uniq -c | sort -rn
|
51
59
|
# Finally, you can generate the full list using this command:
|
52
60
|
# cat data/ingredient-samples-nl | perl -ne '$_=lc($_); /\b(([a-z]\.)+[a-z])\W/ && print "$1\n"' | sort | uniq | sed "s/^/'/;s/$/'i \//"
|
61
|
+
#
|
62
|
+
# Keep this list in sync with {FoodIngredientParser::Loose::Scanner::ABBREV_RE}.
|
63
|
+
# too bad we can't use a shared array for this - https://groups.google.com/d/msg/treetop-dev/f3NveVHi7Aw/0uUogmLMb8wJ
|
53
64
|
(
|
54
65
|
'a.o.p'i /
|
55
66
|
'b.g.a'i /
|
@@ -5,14 +5,14 @@ module FoodIngredientParser::Strict::Grammar
|
|
5
5
|
include IngredientSimple
|
6
6
|
|
7
7
|
rule ingredient_nested
|
8
|
-
( ing:ingredient_simple ws* '(' contains:ingredient_nested_in ws* ')' ws? mark:mark ws* amount:amount <NestedIngredientNode> ) /
|
9
|
-
( ing:ingredient_simple ws* '(' contains:ingredient_nested_in ws* ')' ws* amount:amount <NestedIngredientNode> ) /
|
10
|
-
( ing:ingredient_simple_with_amount ws* '(' contains:ingredient_nested_in ws* ')' ws? mark:mark <NestedIngredientNode> ) /
|
11
|
-
( ing:ingredient_simple_with_amount ws* '(' contains:ingredient_nested_in ws* ')' <NestedIngredientNode> ) /
|
12
|
-
( ing:ingredient_simple ws* '[' contains:ingredient_nested_in ws* ']' ws? mark:mark ws* amount:amount <NestedIngredientNode> ) /
|
13
|
-
( ing:ingredient_simple ws* '[' contains:ingredient_nested_in ws* ']' ws* amount:amount <NestedIngredientNode> ) /
|
14
|
-
( ing:ingredient_simple_with_amount ws* '[' contains:ingredient_nested_in ws* ']' ws? mark:mark <NestedIngredientNode> ) /
|
15
|
-
( ing:ingredient_simple_with_amount ws* '[' contains:ingredient_nested_in ws* ']' <NestedIngredientNode> )
|
8
|
+
( ing:ingredient_simple (ws* ':')? ws* '(' contains:ingredient_nested_in ws* ')' ws? mark:mark ws* amount:amount <NestedIngredientNode> ) /
|
9
|
+
( ing:ingredient_simple (ws* ':')? ws* '(' contains:ingredient_nested_in ws* ')' ws* amount:amount <NestedIngredientNode> ) /
|
10
|
+
( ing:ingredient_simple_with_amount (ws* ':')? ws* '(' contains:ingredient_nested_in ws* ')' ws? mark:mark <NestedIngredientNode> ) /
|
11
|
+
( ing:ingredient_simple_with_amount (ws* ':')? ws* '(' contains:ingredient_nested_in ws* ')' <NestedIngredientNode> ) /
|
12
|
+
( ing:ingredient_simple (ws* ':')? ws* '[' contains:ingredient_nested_in ws* ']' ws? mark:mark ws* amount:amount <NestedIngredientNode> ) /
|
13
|
+
( ing:ingredient_simple (ws* ':')? ws* '[' contains:ingredient_nested_in ws* ']' ws* amount:amount <NestedIngredientNode> ) /
|
14
|
+
( ing:ingredient_simple_with_amount (ws* ':')? ws* '[' contains:ingredient_nested_in ws* ']' ws? mark:mark <NestedIngredientNode> ) /
|
15
|
+
( ing:ingredient_simple_with_amount (ws* ':')? ws* '[' contains:ingredient_nested_in ws* ']' <NestedIngredientNode> )
|
16
16
|
end
|
17
17
|
|
18
18
|
rule ingredient_nested_in
|
@@ -5,9 +5,11 @@ module FoodIngredientParser::Strict::Grammar
|
|
5
5
|
include Ingredient
|
6
6
|
|
7
7
|
rule list_coloned
|
8
|
-
contains:( ( ws* list_coloned_ingredient ws* '.' )+ list_coloned_ingredient
|
9
|
-
contains:( ( ws* list_coloned_ingredient ws* '
|
10
|
-
contains:(
|
8
|
+
contains:( ( ws* list_coloned_ingredient ws* '.' )+ ws* list_coloned_ingredient ) <ListNode> /
|
9
|
+
contains:( ( ws* list_coloned_ingredient ws* '.' )+ ) <ListNode> /
|
10
|
+
contains:( ( ws* list_coloned_ingredient ws* ';' )+ ws* list_coloned_ingredient ) <ListNode> /
|
11
|
+
contains:( ( ws* list_coloned_ingredient ws* ';' )+ ) <ListNode> /
|
12
|
+
contains:( ws* list_coloned_ingredient ) <ListNode>
|
11
13
|
end
|
12
14
|
|
13
15
|
rule list_coloned_inner_list
|
@@ -47,7 +47,7 @@ module FoodIngredientParser::Strict
|
|
47
47
|
h[:name] = name.text_value if respond_to?(:name)
|
48
48
|
h[:name] = pre.text_value + h[:name] if respond_to?(:pre)
|
49
49
|
h[:name] = h[:name] + post.text_value if respond_to?(:post)
|
50
|
-
h[:
|
50
|
+
h[:marks] = [mark.text_value] if respond_to?(:mark) && mark.text_value != ''
|
51
51
|
h
|
52
52
|
end
|
53
53
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: food_ingredient_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0.pre.
|
4
|
+
version: 1.0.0.pre.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- wvengen
|
@@ -48,6 +48,7 @@ files:
|
|
48
48
|
- lib/food_ingredient_parser/loose/scanner.rb
|
49
49
|
- lib/food_ingredient_parser/loose/transform/amount.rb
|
50
50
|
- lib/food_ingredient_parser/loose/transform/amount_from_name.treetop
|
51
|
+
- lib/food_ingredient_parser/loose/transform/handle_missing_name.rb
|
51
52
|
- lib/food_ingredient_parser/strict/grammar.rb
|
52
53
|
- lib/food_ingredient_parser/strict/grammar/amount.treetop
|
53
54
|
- lib/food_ingredient_parser/strict/grammar/common.treetop
|