food_ingredient_parser 1.0.0 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +3 -3
- data/lib/food_ingredient_parser/loose/node.rb +4 -0
- data/lib/food_ingredient_parser/loose/scanner.rb +5 -4
- data/lib/food_ingredient_parser/loose/to_html.rb +66 -0
- data/lib/food_ingredient_parser/loose/transform/handle_missing_name.rb +6 -1
- data/lib/food_ingredient_parser/version.rb +2 -2
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 629792af43b06c646b98c2ab8ee136895123e3e5
|
4
|
+
data.tar.gz: 6abbe88e79943cedb8b8776146aab6fdffd427e1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c1728c6edc995f6a5a18b82eae0e217fa18814c75f1b34d33d52abc4f593428020ba6a425ea526519036a5d4a412a80b43aae55147b2a07bae0e0979394b26bd
|
7
|
+
data.tar.gz: a9f43ad8e20ab867293e1db59aeb5b7bd6147a6e560adb58159f345d441c6445be453b7eb4ba01bdd4169ae951444e4bf90a6e7873cf85cfa7557b4ac4859d36
|
data/README.md
CHANGED
@@ -121,7 +121,7 @@ When ingredient lists are entered manually, it can be very useful to show how th
|
|
121
121
|
recognized. This can help you understand why a certain ingredients list cannot be parsed.
|
122
122
|
|
123
123
|
For this you can use the `to_html` method on the parsed output, which returns the original
|
124
|
-
text, augmented with CSS classes for different parts.
|
124
|
+
text, augmented with CSS classes for different parts.
|
125
125
|
|
126
126
|
```ruby
|
127
127
|
require 'food_ingredient_parser'
|
@@ -178,8 +178,8 @@ Even though the strict parser would not give a result, the loose parser returns:
|
|
178
178
|
|
179
179
|
From the 1.0.0 release, the main interface will be stable. This comprises the two parsers' `parse`
|
180
180
|
methods (incl. documented options), their `nil` result when parsing failed, and the parsed output's
|
181
|
-
`to_h` and `to_html` methods
|
182
|
-
|
181
|
+
`to_h` and `to_html` methods. Please note that parsed node trees may be subject to change, even within
|
182
|
+
a major release. Within a minor release, node trees are expected to remain stable.
|
183
183
|
|
184
184
|
So if you only use the stable interface (`parse`, `to_h` and `to_html`), you can lock your version
|
185
185
|
to e.g. `~> 1.0`. If you depend on more, lock your version against e.g. `~> 1.0.0` and test when you
|
@@ -149,9 +149,10 @@ module FoodIngredientParser::Loose
|
|
149
149
|
|
150
150
|
def is_notes_start?
|
151
151
|
# @todo use more heuristics: don't assume dot is notes when separator is a dot, and only toplevel?
|
152
|
-
|
153
|
-
|
154
|
-
( @s[@i
|
152
|
+
ml = mark_len
|
153
|
+
if ( is_mark? && @s[@i+ml .. -1] =~ /\A\s*=/ ) || # "* = Biologisch"
|
154
|
+
( is_mark? && @s[@i-1] =~ /\s/ && @s[@i+ml .. -1] =~ /\A\s*\w/ ) || # " **Biologisch"
|
155
|
+
( @s[@i..-1] =~ NOTE_RE ) # "E=", "Kan sporen van", ...
|
155
156
|
@i -= 1 # we want to include the mark in the note
|
156
157
|
true
|
157
158
|
# End of sentence
|
@@ -196,7 +197,7 @@ module FoodIngredientParser::Loose
|
|
196
197
|
cur.name ||= begin
|
197
198
|
i, j = cur.interval.first, @i - 1
|
198
199
|
i += mark_len(i) # skip any mark in front
|
199
|
-
Node.new(@s, i .. j) if j
|
200
|
+
Node.new(@s, i .. j) if j >= i
|
200
201
|
end
|
201
202
|
end
|
202
203
|
|
@@ -0,0 +1,66 @@
|
|
1
|
+
require 'cgi'
|
2
|
+
|
3
|
+
# Adds HTML output functionality to a Node.
|
4
|
+
#
|
5
|
+
module FoodIngredientParser::Loose
|
6
|
+
module ToHtml
|
7
|
+
|
8
|
+
# Render the original ingredients list text as HTML.
#
# The text is passed through +node_to_html+, starting at nesting depth 0,
# which wraps the recognized parts in +span+s carrying CSS classes
# (+name+, +amount+, +mark+ and +note+).
#
# @return [String] HTML representation of ingredient list.
def to_html
  node_to_html(self, 0)
end
|
17
|
+
|
18
|
+
private
|
19
|
+
|
20
|
+
# Renders +node+ and everything below it as HTML.
#
# Name, amount, mark and notes are rendered through +node_to_html_single+ with
# the matching CSS class; each contained node is rendered recursively inside a
# +span+ with class +depthN+ (prefixed with +contains+ when +depth+ > 0).
# Input text lying between and after the children is HTML-escaped and copied
# through. A node without any children yields an empty string.
#
# @param node [#input, #interval, #name, #amount, #mark, #notes, #contains] node to render
# @param depth [Integer] nesting level, used for the +depthN+ CSS class
# @return [String] HTML fragment
def node_to_html(node, depth=0)
  children = [*node.contains, *node.notes, node.name, node.amount, node.mark].compact
  children.sort_by! {|n| n.interval.first }

  # mutable buffer that is appended to in place, instead of allocating a new
  # string for every fragment
  html = String.new("")
  last_idx = node.interval.first - 1
  children.each do |child|
    # we don't have nodes for all characters, make sure they are in the output
    gap_end = child.interval.first - 1
    if gap_end > last_idx
      html << CGI.escapeHTML(node.input[last_idx + 1 .. gap_end])
      last_idx = gap_end
    end

    # the order of the checks matters: the first matching role wins
    fragment =
      if child == node.name
        node_to_html_single(child, "name")
      elsif child == node.amount
        node_to_html_single(child, "amount")
      elsif child == node.mark
        node_to_html_single(child, "mark")
      elsif node.notes.include?(child)
        node_to_html_single(child, "note")
      elsif node.contains.include?(child)
        cls = "depth#{depth}"
        cls = "contains #{cls}" if depth > 0
        "<span class='#{cls}'>#{node_to_html(child, depth + 1)}</span>"
      end
    next unless fragment

    html << fragment
    last_idx = child.interval.last
  end

  # include any trailing characters
  if children.any? && last_idx < node.interval.last
    html << CGI.escapeHTML(node.input[last_idx + 1 .. node.interval.last])
  end

  html
end
|
60
|
+
|
61
|
+
# Renders the text of a single node, optionally wrapped in a classed +span+.
#
# Leading and trailing whitespace is kept outside the +span+, and all text is
# HTML-escaped. No +span+ is emitted when +cls+ is not given or when the text
# consists of whitespace only.
#
# @param node [#text_value] node whose text is rendered
# @param cls [String, nil] CSS class for the +span+
# @return [String] HTML fragment
def node_to_html_single(node, cls=nil)
  # /m lets the dot match newlines, so text spanning several lines still matches;
  # without it the match is nil and calling +captures+ raises NoMethodError
  ws1, txt, ws2 = node.text_value.match(/\A(\s*)(.*?)(\s*)\z/m).captures.map {|s| CGI.escapeHTML(s) }
  return "#{ws1}#{txt}#{ws2}" if !cls || txt.empty?

  "#{ws1}<span class='#{cls}'>#{txt}</span>#{ws2}"
end
|
65
|
+
end
|
66
|
+
end
|
@@ -8,7 +8,9 @@ module FoodIngredientParser::Loose
|
|
8
8
|
#
|
9
9
|
# When a contained node is found which doesn't have a name:
|
10
10
|
# * For the amount (if any): ignore it (as it's often ambiguous which ingredient it belongs to)
|
11
|
-
# * For the marks (if any)
|
11
|
+
# * For the marks (if any)
|
12
|
+
# - if the node has no siblings and no containing ingredients, add the mark to the parent (if any)
|
13
|
+
# - else ignore it (we might instead add it to the containing ingredients)
|
12
14
|
# * For the containing ingredients (if any):
|
13
15
|
# - if the previous ingredient is present and doesn't contain ingredients already,
|
14
16
|
# assume the current contained ingredients are actually part of the previous ingredient.
|
@@ -45,6 +47,9 @@ module FoodIngredientParser::Loose
|
|
45
47
|
if prev
|
46
48
|
# there is a previous ingredient: move children to new parent
|
47
49
|
prev.contains.push(*child.contains)
|
50
|
+
elsif child.mark && !node.mark && child.contains.empty? && !child.amount
|
51
|
+
# this is just a mark without siblings: it's a mark for its parent
|
52
|
+
node.mark = child.mark
|
48
53
|
else
|
49
54
|
# there is no previous ingredient: move children one level up
|
50
55
|
new_contains.push(*child.contains)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: food_ingredient_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- wvengen
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-09-
|
11
|
+
date: 2018-09-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: treetop
|
@@ -46,6 +46,7 @@ files:
|
|
46
46
|
- lib/food_ingredient_parser/loose/node.rb
|
47
47
|
- lib/food_ingredient_parser/loose/parser.rb
|
48
48
|
- lib/food_ingredient_parser/loose/scanner.rb
|
49
|
+
- lib/food_ingredient_parser/loose/to_html.rb
|
49
50
|
- lib/food_ingredient_parser/loose/transform/amount.rb
|
50
51
|
- lib/food_ingredient_parser/loose/transform/amount_from_name.treetop
|
51
52
|
- lib/food_ingredient_parser/loose/transform/handle_missing_name.rb
|