food_ingredient_parser 1.0.0.pre.3 → 1.0.0.pre.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +22 -0
- data/food_ingredient_parser.gemspec +1 -1
- data/lib/food_ingredient_parser/grammar/amount.treetop +6 -6
- data/lib/food_ingredient_parser/grammar/common.treetop +1 -1
- data/lib/food_ingredient_parser/nodes.rb +3 -0
- data/lib/food_ingredient_parser/parser.rb +7 -4
- data/lib/food_ingredient_parser/to_html.rb +43 -0
- data/lib/food_ingredient_parser/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: b180a987ae477627cd2046f967d54eeb160fc7ab
|
|
4
|
+
data.tar.gz: '0981287cb5348c58a5fc2e0fda39c470a0b10746'
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: e11033d79eff439544d71c789981361c98192d767687148cca73e02308f288aac474b84f8ffe1b2a08901df481e4a7c03c162bbcc8ed6f5c030b48f12e4ebb61
|
|
7
|
+
data.tar.gz: a44018d0bc1c77d7907e5fa31cf5fe7e919c2e20bf5480b07bbd0de33ff2e6aec4a2a7981b8a22906c574f9140bede380cc87a7917fc84a0f6685190d26925c7
|
data/README.md
CHANGED
|
@@ -108,6 +108,28 @@ parsed 35 (100.0%), no result 0 (0.0%)
|
|
|
108
108
|
|
|
109
109
|
If you want to use the output in (shell)scripts, the options `-e -c` may be quite useful.
|
|
110
110
|
|
|
111
|
+
## `to_html`
|
|
112
|
+
|
|
113
|
+
When ingredient lists are entered manually, it can be very useful to show how the text is
|
|
114
|
+
recognized. This can help in understanding why a certain ingredient list cannot be parsed.
|
|
115
|
+
|
|
116
|
+
For this you can use the `to_html` method on the parsed output, which returns the original
|
|
117
|
+
text, augmented with CSS classes for different parts.
|
|
118
|
+
|
|
119
|
+
```ruby
|
|
120
|
+
require 'food_ingredient_parser'
|
|
121
|
+
|
|
122
|
+
parsed = FoodIngredientParser::Parser.new.parse("Saus (10% tomaat*, zout). * = bio")
|
|
123
|
+
puts parsed.to_html
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
```html
|
|
127
|
+
<span class='name'>Saus</span> (<span class='amount'>10%</span>
|
|
128
|
+
<span class='name'>tomaat</span><span class='mark'>*</span>,
|
|
129
|
+
<span class='name'>zout</span>). <span class='note'>* = bio</span>
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
|
|
111
133
|
## Test data
|
|
112
134
|
|
|
113
135
|
[`data/ingredient-samples-nl`](data/ingredient-samples-nl) contains about 150k
|
|
@@ -4,7 +4,7 @@ require 'food_ingredient_parser/version'
|
|
|
4
4
|
Gem::Specification.new do |s|
|
|
5
5
|
s.name = 'food_ingredient_parser'
|
|
6
6
|
s.version = FoodIngredientParser::VERSION
|
|
7
|
-
s.date = '2018-
|
|
7
|
+
s.date = '2018-09-05'
|
|
8
8
|
s.summary = 'Parser for ingredient lists found on food products.'
|
|
9
9
|
s.authors = ['wvengen']
|
|
10
10
|
s.email = ['dev-ruby@willem.engen.nl']
|
|
@@ -11,19 +11,19 @@ module FoodIngredientParser::Grammar
|
|
|
11
11
|
|
|
12
12
|
rule simple_amount
|
|
13
13
|
( (
|
|
14
|
-
'of which' / 'at least' /
|
|
15
|
-
'waarvan' / 'ten minste' / 'tenminste' / 'minimaal'
|
|
14
|
+
'of which'i / 'at least'i / 'minimal'i / 'maximal'i / 'less than'i / 'more than'i /
|
|
15
|
+
'waarvan'i / 'ten minste'i / 'tenminste'i / 'minimaal'i / 'maximaal'i / 'minder dan'i / 'meer dan'i
|
|
16
16
|
) ws* )?
|
|
17
|
-
[
|
|
17
|
+
[±∓~∼∽≂≃≈≲≤<>≥≳]? ws*
|
|
18
18
|
simple_amount_quantity
|
|
19
19
|
( ws+ (
|
|
20
|
-
'minimum' /
|
|
21
|
-
'minimaal' / 'minimum'
|
|
20
|
+
'minimum'i /
|
|
21
|
+
'minimaal'i / 'minimum'i
|
|
22
22
|
) )?
|
|
23
23
|
end
|
|
24
24
|
|
|
25
25
|
rule simple_amount_quantity
|
|
26
|
-
number ( ws* '-' ws* number )? ws* ( '%' / 'g' / 'mg' / 'gram' / 'ml' )
|
|
26
|
+
number ( ws* '-' ws* number )? ws* ( '%' / 'g'i / 'mg'i / 'gram'i / 'ml'i )
|
|
27
27
|
end
|
|
28
28
|
|
|
29
29
|
end
|
|
@@ -20,7 +20,7 @@ module FoodIngredientParser::Grammar
|
|
|
20
20
|
|
|
21
21
|
rule mark
|
|
22
22
|
# mark referencing a footnote
|
|
23
|
-
[¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº] '⁾'? / '⁽' [¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº] '⁾' / [†‡•°#^] / '*'+
|
|
23
|
+
[¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº] '⁾'? / '⁽' [¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº] '⁾' / [†‡•°#^] / '*'+ / '(' ws* ( [†‡•°#^] / '*'+ ) ws* ')'
|
|
24
24
|
end
|
|
25
25
|
|
|
26
26
|
rule digit
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
require 'treetop/runtime'
|
|
2
|
+
require_relative 'to_html'
|
|
2
3
|
|
|
3
4
|
# Needs to be in grammar namespace so Treetop can find the nodes.
|
|
4
5
|
module FoodIngredientParser::Grammar
|
|
@@ -18,6 +19,8 @@ module FoodIngredientParser::Grammar
|
|
|
18
19
|
|
|
19
20
|
# Root object, contains everything else.
|
|
20
21
|
class RootNode < SyntaxNode
|
|
22
|
+
include FoodIngredientParser::ToHtml
|
|
23
|
+
|
|
21
24
|
def to_h
|
|
22
25
|
h = { contains: contains.to_a }
|
|
23
26
|
if notes && notes_ary = to_a_deep(notes, NoteNode)&.map(&:text_value)
|
|
@@ -5,20 +5,23 @@ module FoodIngredientParser
|
|
|
5
5
|
|
|
6
6
|
# @!attribute [r] parser
|
|
7
7
|
# @return [Treetop::Runtime::CompiledParser] low-level parser object
|
|
8
|
+
# @note This attribute is there for convenience, but may change in the future. Take care.
|
|
8
9
|
attr_reader :parser
|
|
9
10
|
|
|
10
11
|
# Create a new food ingredient parser
|
|
11
|
-
# @return [FoodIngredientParser]
|
|
12
|
+
# @return [FoodIngredientParser::Parser]
|
|
12
13
|
def initialize
|
|
13
14
|
@parser = Grammar::RootParser.new
|
|
14
15
|
end
|
|
15
16
|
|
|
16
17
|
# Parse food ingredient list text into a structured representation.
|
|
18
|
+
#
|
|
17
19
|
# @option clean [Boolean] pass +false+ to disable correcting frequently occurring issues
|
|
18
|
-
# @return [
|
|
19
|
-
|
|
20
|
+
# @return [FoodIngredientParser::Grammar::RootNode] structured representation of food ingredients
|
|
21
|
+
# @note Unrecognized options are passed to Treetop, but this is not guaranteed to remain so forever.
|
|
22
|
+
def parse(s, clean: true, **options)
|
|
20
23
|
s = clean(s) if clean
|
|
21
|
-
@parser.parse(s)
|
|
24
|
+
@parser.parse(s, **options)
|
|
22
25
|
end
|
|
23
26
|
|
|
24
27
|
private
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
require 'cgi'
|
|
2
|
+
|
|
3
|
+
# Adds HTML output functionality to a Treetop Node.
|
|
4
|
+
#
|
|
5
|
+
# The node needs to provide a {#to_h} method (for {#to_html_h}).
|
|
6
|
+
#
|
|
7
|
+
module FoodIngredientParser::ToHtml
|
|
8
|
+
|
|
9
|
+
# Markup original ingredients list text in HTML.
|
|
10
|
+
#
|
|
11
|
+
# The input text is returned as HTML, augmented with CSS classes
|
|
12
|
+
# on +span+s for +name+, +amount+, +mark+ and +note+.
|
|
13
|
+
#
|
|
14
|
+
# @return [String] HTML representation of ingredient list.
|
|
15
|
+
def to_html
|
|
16
|
+
node_to_html(self)
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
private
|
|
20
|
+
|
|
21
|
+
def node_to_html(node, cls=nil)
|
|
22
|
+
el_cls = {} # map of node instances to class names for contained elements
|
|
23
|
+
terminal = node.terminal? # whether to look at children elements or not
|
|
24
|
+
|
|
25
|
+
if node.is_a?(FoodIngredientParser::Grammar::AmountNode)
|
|
26
|
+
cls ||= "amount"
|
|
27
|
+
elsif node.is_a?(FoodIngredientParser::Grammar::NoteNode)
|
|
28
|
+
cls ||= "note"
|
|
29
|
+
terminal = true # NoteNodes may contain other NoteNodes, we want it flat.
|
|
30
|
+
elsif node.is_a?(FoodIngredientParser::Grammar::IngredientNode)
|
|
31
|
+
el_cls[node.name] = "name" if node.respond_to?(:name)
|
|
32
|
+
el_cls[node.mark] = "mark" if node.respond_to?(:mark)
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
val = if terminal
|
|
36
|
+
CGI.escapeHTML(node.text_value)
|
|
37
|
+
else
|
|
38
|
+
node.elements.map {|el| node_to_html(el, el_cls[el]) }.join("")
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
cls ? "<span class='#{cls}'>#{val}</span>" : val
|
|
42
|
+
end
|
|
43
|
+
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: food_ingredient_parser
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.0.0.pre.3
|
|
4
|
+
version: 1.0.0.pre.4
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- wvengen
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2018-
|
|
11
|
+
date: 2018-09-05 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: treetop
|
|
@@ -55,6 +55,7 @@ files:
|
|
|
55
55
|
- lib/food_ingredient_parser/grammar/root.treetop
|
|
56
56
|
- lib/food_ingredient_parser/nodes.rb
|
|
57
57
|
- lib/food_ingredient_parser/parser.rb
|
|
58
|
+
- lib/food_ingredient_parser/to_html.rb
|
|
58
59
|
- lib/food_ingredient_parser/version.rb
|
|
59
60
|
homepage: https://github.com/q-m/food-ingredient-parser-ruby
|
|
60
61
|
licenses:
|