food_ingredient_parser 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/food_ingredient_parser/loose/node.rb +11 -4
- data/lib/food_ingredient_parser/loose/scanner.rb +20 -11
- data/lib/food_ingredient_parser/loose/transform/amount.rb +17 -9
- data/lib/food_ingredient_parser/loose/transform/handle_missing_name.rb +2 -1
- data/lib/food_ingredient_parser/loose/transform/split_e_numbers.rb +17 -11
- data/lib/food_ingredient_parser/strict/grammar/list.treetop +7 -5
- data/lib/food_ingredient_parser/strict/grammar/list_coloned.treetop +1 -0
- data/lib/food_ingredient_parser/version.rb +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: deb4cb55b3d5c41f02171e031fd11cc996cf2e8df9f074aa163efdff58baa6b0
|
4
|
+
data.tar.gz: 63dc1b52a15e6f70114cca9ed5d8a585a1f475d70131e4852310cf8755558dca
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1cebae488578f1e00f8d905f34d39cef653cdcb4922d26878687afb3463ae3c24ca6592a6f19ac482b5a2f08e95feef342c9631da69acb99feea4e1a81269057
|
7
|
+
data.tar.gz: a7c8b98a5c3fd3aee8962e8f31cd9a0ede791e8d7c7193bfbb1ba2524be057bd7b38499973cd6ab6fea58020161ba6f8615dc5eef2191a87d45afec4662fa264
|
@@ -5,7 +5,7 @@ module FoodIngredientParser::Loose
|
|
5
5
|
class Node
|
6
6
|
include ToHtml
|
7
7
|
|
8
|
-
attr_accessor :
|
8
|
+
attr_accessor :name_parts, :mark, :amount, :contains, :notes
|
9
9
|
attr_reader :input, :interval, :auto_close
|
10
10
|
|
11
11
|
def initialize(input, interval, auto_close: false)
|
@@ -14,7 +14,8 @@ module FoodIngredientParser::Loose
|
|
14
14
|
@auto_close = auto_close
|
15
15
|
@contains = []
|
16
16
|
@notes = []
|
17
|
-
@
|
17
|
+
@name_parts = []
|
18
|
+
@mark = @amount = nil
|
18
19
|
end
|
19
20
|
|
20
21
|
def ends(index)
|
@@ -31,7 +32,8 @@ module FoodIngredientParser::Loose
|
|
31
32
|
|
32
33
|
def to_h
|
33
34
|
r = {}
|
34
|
-
|
35
|
+
_name = name
|
36
|
+
r[:name] = _name if _name
|
35
37
|
r[:marks] = [mark.text_value.strip] if mark
|
36
38
|
r[:amount] = amount.text_value.strip if amount
|
37
39
|
r[:contains] = contains.map(&:to_h).reject {|c| c == {} } if contains.any?
|
@@ -39,6 +41,11 @@ module FoodIngredientParser::Loose
|
|
39
41
|
r
|
40
42
|
end
|
41
43
|
|
44
|
+
def name
|
45
|
+
strings = name_parts.map {|n| n.text_value.strip }.reject {|n| n == nil || n == '' }
|
46
|
+
return strings.any? ? strings.join(" ") : nil
|
47
|
+
end
|
48
|
+
|
42
49
|
def inspect(indent="", variant="")
|
43
50
|
inspect_self(indent, variant) +
|
44
51
|
inspect_children(indent)
|
@@ -47,7 +54,7 @@ module FoodIngredientParser::Loose
|
|
47
54
|
def inspect_self(indent="", variant="")
|
48
55
|
[
|
49
56
|
indent + "Node#{variant} interval=#{@interval}",
|
50
|
-
name ? "name=#{name.
|
57
|
+
name ? "name=#{name.inspect}" : nil,
|
51
58
|
mark ? "mark=#{mark.text_value.strip.inspect}" : nil,
|
52
59
|
amount ? "amount=#{amount.text_value.strip.inspect}" : nil,
|
53
60
|
auto_close ? "auto_close" : nil
|
@@ -33,8 +33,9 @@ module FoodIngredientParser::Loose
|
|
33
33
|
|
34
34
|
def initialize(s, index: 0)
|
35
35
|
@s = s # input string
|
36
|
-
@i = index # current index in string
|
36
|
+
@i = index # current index in string, the iterator looks at this character
|
37
37
|
@cur = nil # current node we're populating
|
38
|
+
@curifree = nil # last index in string for current node that we haven't added to a child node yet
|
38
39
|
@ancestors = [Node.new(@s, @i)] # nesting hierarchy
|
39
40
|
@iterator = :beginning # scan_iteration_<iterator> to use for parsing
|
40
41
|
@dest = :contains # append current node to this attribute on parent
|
@@ -79,6 +80,7 @@ module FoodIngredientParser::Loose
|
|
79
80
|
# after bracket check for 'and' to not lose text
|
80
81
|
if is_and_sep?(@i+1)
|
81
82
|
@i += and_sep_len(@i+1)
|
83
|
+
@curifree = @i # don't include 'and' in cur name
|
82
84
|
add_child
|
83
85
|
end
|
84
86
|
elsif is_notes_start? # usually a dot marks the start of notes
|
@@ -147,7 +149,11 @@ module FoodIngredientParser::Loose
|
|
147
149
|
end
|
148
150
|
|
149
151
|
def cur
|
150
|
-
|
152
|
+
if !@cur
|
153
|
+
@cur ||= Node.new(@s, @i)
|
154
|
+
@curifree = @i
|
155
|
+
end
|
156
|
+
@cur
|
151
157
|
end
|
152
158
|
|
153
159
|
def is_sep?(chars: SEP_CHARS)
|
@@ -201,16 +207,19 @@ module FoodIngredientParser::Loose
|
|
201
207
|
cur.ends(@i-1)
|
202
208
|
parent.send(@dest) << cur
|
203
209
|
@cur = nil
|
210
|
+
@curifree = nil
|
204
211
|
end
|
205
212
|
|
206
213
|
def open_parent(**options)
|
207
214
|
name_until_here
|
208
215
|
@ancestors << cur
|
209
216
|
@cur = Node.new(@s, @i + 1, **options)
|
217
|
+
@curifree = @i + 1
|
210
218
|
end
|
211
219
|
|
212
220
|
def close_parent
|
213
221
|
return unless @ancestors.count > 1
|
222
|
+
@curifree = @i + 1
|
214
223
|
@cur = @ancestors.pop
|
215
224
|
while @cur.auto_close
|
216
225
|
add_child
|
@@ -227,15 +236,15 @@ module FoodIngredientParser::Loose
|
|
227
236
|
end
|
228
237
|
|
229
238
|
def name_until_here
|
230
|
-
cur
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
+
return unless @curifree # no cur started yet
|
240
|
+
i, j = @curifree, @i - 1
|
241
|
+
i += mark_len(i) # skip any mark in front
|
242
|
+
# Set name if there is any. There is one corner-case that needs to be avoided when
|
243
|
+
# a nesting was opened without a name, which would set the name to the nesting text.
|
244
|
+
# In this case, the name starts with an open-nesting symbol, which should never happen.
|
245
|
+
if j >= i && !"([:".include?(@s[i])
|
246
|
+
cur.name_parts << Node.new(@s, i .. j)
|
247
|
+
@curifree = @i
|
239
248
|
end
|
240
249
|
end
|
241
250
|
|
@@ -29,18 +29,26 @@ module FoodIngredientParser::Loose
|
|
29
29
|
|
30
30
|
# Extract amount from name, if any.
|
31
31
|
def transform_name(node = @node)
|
32
|
-
if !node.amount
|
33
|
-
|
32
|
+
if !node.amount
|
33
|
+
node.name_parts.each_with_index do |name, i|
|
34
|
+
parsed = parse_amount(name.text_value)
|
35
|
+
next unless parsed
|
36
|
+
offset = name.interval.first
|
34
37
|
|
35
|
-
|
36
|
-
|
38
|
+
amount = parsed.amount.amount
|
39
|
+
node.amount = Node.new(node.input, offset + amount.interval.first .. offset + amount.interval.last - 1)
|
37
40
|
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
41
|
+
name = parsed.respond_to?(:name) && parsed.name
|
42
|
+
node.name_parts[i] = if name && name.interval.count > 0
|
43
|
+
Node.new(node.input, offset + name.interval.first .. offset + name.interval.last - 1)
|
44
|
+
else
|
45
|
+
nil
|
46
|
+
end
|
47
|
+
# found an amount, stop looking in other parts
|
48
|
+
break
|
43
49
|
end
|
50
|
+
# remove cleared name parts
|
51
|
+
node.name_parts.reject!(&:nil?)
|
44
52
|
end
|
45
53
|
|
46
54
|
# recursively transform contained nodes
|
@@ -42,7 +42,8 @@ module FoodIngredientParser::Loose
|
|
42
42
|
# Apply recursively. Do it before processing to handle multiple depth levels of missing names.
|
43
43
|
transform_children!(child) if child.contains.any?
|
44
44
|
|
45
|
-
|
45
|
+
name = child.name
|
46
|
+
if name.nil? || name == ''
|
46
47
|
# Name is empty, we need to do something.
|
47
48
|
if prev
|
48
49
|
# there is a previous ingredient: move children to new parent
|
@@ -29,21 +29,27 @@ module FoodIngredientParser::Loose
|
|
29
29
|
def transform_node!(node)
|
30
30
|
if node.contains.any?
|
31
31
|
node.contains.each {|n| transform_node!(n) }
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
32
|
+
else
|
33
|
+
node.name_parts.each_with_index do |name, name_index|
|
34
|
+
if m = MATCH_RE.match(name.text_value)
|
35
|
+
i = 0
|
36
|
+
while m = name.text_value.match(SPLIT_RE, i)
|
37
|
+
node.contains << new_node(name, i, m.begin(0)-1)
|
38
|
+
i = m.end(0)
|
39
|
+
end
|
40
|
+
node.contains << new_node(name, i, name.interval.last) if i <= name.interval.last
|
41
|
+
node.name_parts[name_index] = nil
|
42
|
+
end
|
37
43
|
end
|
38
|
-
|
39
|
-
node.
|
44
|
+
# remove cleared name parts
|
45
|
+
node.name_parts.reject!(&:nil?)
|
40
46
|
end
|
41
47
|
end
|
42
48
|
|
43
|
-
def new_node(
|
44
|
-
offset =
|
45
|
-
new_node = Node.new(
|
46
|
-
new_node.
|
49
|
+
def new_node(name, begins, ends)
|
50
|
+
offset = name.interval.first
|
51
|
+
new_node = Node.new(name.input, offset + begins .. offset + ends)
|
52
|
+
new_node.name_parts = [Node.new(name.input, new_node.interval)]
|
47
53
|
new_node
|
48
54
|
end
|
49
55
|
end
|
@@ -4,11 +4,13 @@ module FoodIngredientParser::Strict::Grammar
|
|
4
4
|
include Ingredient
|
5
5
|
|
6
6
|
rule list
|
7
|
-
contains:(ingredient ( ws* '|' ws* ingredient )+ ( ws+ and ws+ ingredient )? )
|
8
|
-
contains:(ingredient ( ws* ';' ws* ingredient )+ ( ws+ and ws+ ingredient )? )
|
9
|
-
contains:(ingredient ( ws* ',' ws* ingredient )+ ( ws+ and ws+ ingredient )? )
|
10
|
-
contains:(ingredient ( ws* '.' ws* ingredient )+ ( ws+ and ws+ ingredient )? )
|
11
|
-
contains:(
|
7
|
+
contains:(ingredient ( ws* '|' ws* ingredient )+ ( ws+ and ws+ ingredient )? ) <ListNode> /
|
8
|
+
contains:(ingredient ( ws* ';' ws* ingredient )+ ( ws+ and ws+ ingredient )? ) <ListNode> /
|
9
|
+
contains:(ingredient ( ws* ',' ws* ingredient )+ ( ws+ and ws+ ingredient )? ) <ListNode> /
|
10
|
+
contains:(ingredient ( ws* '.' ws* ingredient )+ ( ws+ and ws+ ingredient )? ) <ListNode> /
|
11
|
+
contains:(ingredient_simple_e_number ( ws* '/' ws* ingredient_simple_e_number )+ ) <ListNode> /
|
12
|
+
contains:(ingredient_simple_e_number ( ws* dash ws* ingredient_simple_e_number )+ ) <ListNode> /
|
13
|
+
contains:(ingredient ( ws+ and ws+ ingredient )? ) <ListNode>
|
12
14
|
end
|
13
15
|
end
|
14
16
|
end
|
@@ -17,6 +17,7 @@ module FoodIngredientParser::Strict::Grammar
|
|
17
17
|
end
|
18
18
|
|
19
19
|
rule list_coloned_inner_list
|
20
|
+
contains:( ingredient_simple_e_number ( ws* '/' ws* ingredient_simple_e_number )+ ) <ListNode> /
|
20
21
|
contains:( ingredient_simple_e_number ( ws* dash ws* ingredient_simple_e_number )+ ) <ListNode> /
|
21
22
|
contains:( ingredient ( ws* ',' ws* ingredient )* ) <ListNode>
|
22
23
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: food_ingredient_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- wvengen
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-06-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: treetop
|