food_ingredient_parser 1.0.0.pre.9 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +11 -0
- data/lib/food_ingredient_parser/loose/scanner.rb +19 -14
- data/lib/food_ingredient_parser/strict/grammar/common.treetop +13 -6
- data/lib/food_ingredient_parser/strict/grammar/ingredient_coloned.treetop +1 -1
- data/lib/food_ingredient_parser/version.rb +2 -2
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2c529b63bd3a9f6139ed10663b2cef70ff3d1dc6
|
4
|
+
data.tar.gz: a8effec91559e15920794c61b08a486e572032cf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 35f27c7d83effc16962a65ac4c8c09fb5694373dbd3d2745c434c37ddcf3fc466264c0f10cbe5054876517839a81bf75ab0b1b9876098c1b04b5312138e06ea1
|
7
|
+
data.tar.gz: 28e517777928262b45836d899ff919f725e09ab6e116fd9f58545262b7ae7151821ae683e6d33146d3b5e20f5c02ad7217c4985267460645b0ba80e1ccc19751
|
data/README.md
CHANGED
@@ -174,6 +174,17 @@ Even though the strict parser would not give a result, the loose parser returns:
|
|
174
174
|
}
|
175
175
|
```
|
176
176
|
|
177
|
+
## Compatibility
|
178
|
+
|
179
|
+
From the 1.0.0 release, the main interface will be stable. This comprises the two parsers' `parse`
|
180
|
+
methods (incl. documented options), their `nil` result when parsing fails, and the parsed output's
|
181
|
+
`to_h` and `to_html` methods (where available). Please note that parsed node trees may be subject to
|
182
|
+
change, even within a major release. Within a minor release, node trees are expected to remain stable.
|
183
|
+
|
184
|
+
So if you only use the stable interface (`parse`, `to_h` and `to_html`), you can lock your version
|
185
|
+
to e.g. `~> 1.0`. If you depend on more, lock your version against e.g. `~> 1.0.0` and test when you
|
186
|
+
upgrade to `1.1`.
|
187
|
+
|
177
188
|
## Test data
|
178
189
|
|
179
190
|
[`data/ingredient-samples-nl`](data/ingredient-samples-nl) contains about 150k
|
@@ -6,14 +6,14 @@ module FoodIngredientParser::Loose
|
|
6
6
|
SEP_CHARS = "|;,.".freeze
|
7
7
|
MARK_CHARS = "¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº⁽⁾†‡•°#^*".freeze
|
8
8
|
PREFIX_RE = /\A\s*(ingredients|contains|ingred[iï][eë]nt(en)?(declaratie)?|bevat|dit zit er\s?in|samenstelling|zutaten)\b\s*[:;.]?\s*/i.freeze
|
9
|
-
NOTE_RE = /\A\b(dit product kan\b|kan sporen\b.*?\bbevatten\b|voor allergenen\b|allergenen\b|E\s*=|gemaakt in\b|geproduceerd in\b|bevat mogelijk\b|kijk voor meer\b|allergie-info|in de fabriek\b|in dit bedrijf\b)/i.freeze
|
9
|
+
NOTE_RE = /\A\b(dit product kan\b|deze verpakking kan\b|kan sporen\b.*?\bbevatten\b|voor allergenen\b|allergenen\b|E\s*=|gemaakt in\b|geproduceerd in\b|bevat mogelijk\b|kijk voor meer\b|allergie-info|in de fabriek\b|in dit bedrijf\b)/i.freeze
|
10
10
|
# Keep in sync with +abbrev+ in the +Common+ grammar, plus relevant ones from the +Amount+ grammar.
|
11
|
-
ABBREV_RE = Regexp.union(
|
11
|
+
ABBREV_RE = Regexp.union(/\A(N°|°C|(ijzer|chroom|koper)\s*\(I+\)\s*[[:alnum:]]+)\b/i, *%w[
|
12
12
|
a.o.p b.g.a b.o.b c.a c.i d.e d.m.v d.o.c d.o.p d.s e.a e.g e.u f.i.l f.o.s i.a
|
13
13
|
i.d i.e i.g.m.e i.g.p i.m.v i.o i.v.m l.s.l n.a n.b n.o n.v.t o.a o.b.v p.d.o
|
14
|
-
p.g.i q.s s.l s.s t.o.v u.h.t v.g v.s w.a w.o w.v vit denat
|
14
|
+
p.g.i q.s s.l s.s t.o.v u.h.t v.g v.s w.a w.o w.v vit denat
|
15
15
|
min max ca
|
16
|
-
].map {|s| /\A#{Regexp.escape(s)}\b\.?/})
|
16
|
+
].map {|s| /\A#{Regexp.escape(s)}\b\.?/i}).freeze
|
17
17
|
|
18
18
|
def initialize(s, index: 0)
|
19
19
|
@s = s # input string
|
@@ -68,7 +68,8 @@ module FoodIngredientParser::Loose
|
|
68
68
|
add_child
|
69
69
|
elsif ":".include?(c) # another open nesting
|
70
70
|
if @s[@i+1..-1] =~ /\A\s*(\(|\[)/
|
71
|
-
# ignore
|
71
|
+
# ignore colon before an open bracket, then it's a regular nesting
|
72
|
+
name_until_here
|
72
73
|
else
|
73
74
|
open_parent(auto_close: true)
|
74
75
|
@iterator = :colon
|
@@ -129,16 +130,16 @@ module FoodIngredientParser::Loose
|
|
129
130
|
chars.include?(c) && @s[@i-1..@i+1] !~ /\A\d.\d\z/
|
130
131
|
end
|
131
132
|
|
132
|
-
def is_mark?
|
133
|
-
mark_len > 0 && @s[
|
133
|
+
def is_mark?(i = @i)
|
134
|
+
mark_len(i) > 0 && @s[i..i+1] !~ /\A°[CF]/
|
134
135
|
end
|
135
136
|
|
136
|
-
def mark_len
|
137
|
-
|
138
|
-
while @s[
|
139
|
-
|
137
|
+
def mark_len(i = @i)
|
138
|
+
j = i
|
139
|
+
while @s[j] && MARK_CHARS.include?(@s[j])
|
140
|
+
j += 1
|
140
141
|
end
|
141
|
-
|
142
|
+
j - i
|
142
143
|
end
|
143
144
|
|
144
145
|
def abbrev_len
|
@@ -162,8 +163,8 @@ module FoodIngredientParser::Loose
|
|
162
163
|
end
|
163
164
|
|
164
165
|
def add_child
|
166
|
+
name_until_here
|
165
167
|
cur.ends(@i-1)
|
166
|
-
cur.name ||= Node.new(@s, cur.interval)
|
167
168
|
parent.send(@dest) << cur
|
168
169
|
@cur = nil
|
169
170
|
end
|
@@ -192,7 +193,11 @@ module FoodIngredientParser::Loose
|
|
192
193
|
end
|
193
194
|
|
194
195
|
def name_until_here
|
195
|
-
cur.name ||=
|
196
|
+
cur.name ||= begin
|
197
|
+
i, j = cur.interval.first, @i - 1
|
198
|
+
i += mark_len(i) # skip any mark in front
|
199
|
+
Node.new(@s, i .. j) if j > i
|
200
|
+
end
|
196
201
|
end
|
197
202
|
|
198
203
|
def dot_is_not_sep?
|
@@ -44,7 +44,7 @@ module FoodIngredientParser::Strict::Grammar
|
|
44
44
|
end
|
45
45
|
|
46
46
|
rule word
|
47
|
-
abbrev / char+
|
47
|
+
abbrev / word_complex / char+
|
48
48
|
end
|
49
49
|
|
50
50
|
rule and
|
@@ -104,13 +104,20 @@ module FoodIngredientParser::Strict::Grammar
|
|
104
104
|
'w.a'i /
|
105
105
|
'w.o'i /
|
106
106
|
'w.v'i /
|
107
|
-
#
|
108
|
-
'vit
|
109
|
-
'denat
|
110
|
-
'N°'i /
|
111
|
-
'°C'i
|
107
|
+
# not auto-generated additions
|
108
|
+
'vit'i /
|
109
|
+
'denat'i
|
112
110
|
)
|
113
111
|
'.'? ![[:alpha:]]
|
114
112
|
end
|
113
|
+
|
114
|
+
rule word_complex
|
115
|
+
# Complex words that contain characters that would otherwise be considered non-words.
|
116
|
+
(
|
117
|
+
'N°'i /
|
118
|
+
'°C'i /
|
119
|
+
( 'ijzer'i / 'chroom'i / 'koper'i ) ws* '(' 'I'i+ ')' ws* [[:alnum:]]+
|
120
|
+
) ![[:alpha:]]
|
121
|
+
end
|
115
122
|
end
|
116
123
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: food_ingredient_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- wvengen
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-09-
|
11
|
+
date: 2018-09-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: treetop
|
@@ -81,9 +81,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
81
81
|
version: '0'
|
82
82
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
83
83
|
requirements:
|
84
|
-
- - "
|
84
|
+
- - ">="
|
85
85
|
- !ruby/object:Gem::Version
|
86
|
-
version:
|
86
|
+
version: '0'
|
87
87
|
requirements: []
|
88
88
|
rubyforge_project:
|
89
89
|
rubygems_version: 2.6.13
|