food_ingredient_parser 1.0.0.pre.9 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +11 -0
- data/lib/food_ingredient_parser/loose/scanner.rb +19 -14
- data/lib/food_ingredient_parser/strict/grammar/common.treetop +13 -6
- data/lib/food_ingredient_parser/strict/grammar/ingredient_coloned.treetop +1 -1
- data/lib/food_ingredient_parser/version.rb +2 -2
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2c529b63bd3a9f6139ed10663b2cef70ff3d1dc6
|
4
|
+
data.tar.gz: a8effec91559e15920794c61b08a486e572032cf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 35f27c7d83effc16962a65ac4c8c09fb5694373dbd3d2745c434c37ddcf3fc466264c0f10cbe5054876517839a81bf75ab0b1b9876098c1b04b5312138e06ea1
|
7
|
+
data.tar.gz: 28e517777928262b45836d899ff919f725e09ab6e116fd9f58545262b7ae7151821ae683e6d33146d3b5e20f5c02ad7217c4985267460645b0ba80e1ccc19751
|
data/README.md
CHANGED
@@ -174,6 +174,17 @@ Even though the strict parser would not give a result, the loose parser returns:
|
|
174
174
|
}
|
175
175
|
```
|
176
176
|
|
177
|
+
## Compatibility
|
178
|
+
|
179
|
+
From the 1.0.0 release, the main interface will be stable. This comprises the two parsers' `parse`
|
180
|
+
methods (incl. documented options), their `nil` result when parsing fails, and the parsed output's
|
181
|
+
`to_h` and `to_html` methods (where available). Please note that parsed node trees may be subject to
|
182
|
+
change, even within a major release. Within a minor release, node trees are expected to remain stable.
|
183
|
+
|
184
|
+
So if you only use the stable interface (`parse`, `to_h` and `to_html`), you can lock your version
|
185
|
+
to e.g. `~> 1.0`. If you depend on more, lock your version against e.g. `~> 1.0.0` and test when you
|
186
|
+
upgrade to `1.1`.
|
187
|
+
|
177
188
|
## Test data
|
178
189
|
|
179
190
|
[`data/ingredient-samples-nl`](data/ingredient-samples-nl) contains about 150k
|
@@ -6,14 +6,14 @@ module FoodIngredientParser::Loose
|
|
6
6
|
SEP_CHARS = "|;,.".freeze
|
7
7
|
MARK_CHARS = "¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº⁽⁾†‡•°#^*".freeze
|
8
8
|
PREFIX_RE = /\A\s*(ingredients|contains|ingred[iï][eë]nt(en)?(declaratie)?|bevat|dit zit er\s?in|samenstelling|zutaten)\b\s*[:;.]?\s*/i.freeze
|
9
|
-
NOTE_RE = /\A\b(dit product kan\b|kan sporen\b.*?\bbevatten\b|voor allergenen\b|allergenen\b|E\s*=|gemaakt in\b|geproduceerd in\b|bevat mogelijk\b|kijk voor meer\b|allergie-info|in de fabriek\b|in dit bedrijf\b)/i.freeze
|
9
|
+
NOTE_RE = /\A\b(dit product kan\b|deze verpakking kan\b|kan sporen\b.*?\bbevatten\b|voor allergenen\b|allergenen\b|E\s*=|gemaakt in\b|geproduceerd in\b|bevat mogelijk\b|kijk voor meer\b|allergie-info|in de fabriek\b|in dit bedrijf\b)/i.freeze
|
10
10
|
# Keep in sync with +abbrev+ in the +Common+ grammar, plus relevant ones from the +Amount+ grammar.
|
11
|
-
ABBREV_RE = Regexp.union(
|
11
|
+
ABBREV_RE = Regexp.union(/\A(N°|°C|(ijzer|chroom|koper)\s*\(I+\)\s*[[:alnum:]]+)\b/i, *%w[
|
12
12
|
a.o.p b.g.a b.o.b c.a c.i d.e d.m.v d.o.c d.o.p d.s e.a e.g e.u f.i.l f.o.s i.a
|
13
13
|
i.d i.e i.g.m.e i.g.p i.m.v i.o i.v.m l.s.l n.a n.b n.o n.v.t o.a o.b.v p.d.o
|
14
|
-
p.g.i q.s s.l s.s t.o.v u.h.t v.g v.s w.a w.o w.v vit denat
|
14
|
+
p.g.i q.s s.l s.s t.o.v u.h.t v.g v.s w.a w.o w.v vit denat
|
15
15
|
min max ca
|
16
|
-
].map {|s| /\A#{Regexp.escape(s)}\b\.?/})
|
16
|
+
].map {|s| /\A#{Regexp.escape(s)}\b\.?/i}).freeze
|
17
17
|
|
18
18
|
def initialize(s, index: 0)
|
19
19
|
@s = s # input string
|
@@ -68,7 +68,8 @@ module FoodIngredientParser::Loose
|
|
68
68
|
add_child
|
69
69
|
elsif ":".include?(c) # another open nesting
|
70
70
|
if @s[@i+1..-1] =~ /\A\s*(\(|\[)/
|
71
|
-
# ignore
|
71
|
+
# ignore colon before an open bracket, then it's a regular nesting
|
72
|
+
name_until_here
|
72
73
|
else
|
73
74
|
open_parent(auto_close: true)
|
74
75
|
@iterator = :colon
|
@@ -129,16 +130,16 @@ module FoodIngredientParser::Loose
|
|
129
130
|
chars.include?(c) && @s[@i-1..@i+1] !~ /\A\d.\d\z/
|
130
131
|
end
|
131
132
|
|
132
|
-
def is_mark?
|
133
|
-
mark_len > 0 && @s[
|
133
|
+
def is_mark?(i = @i)
|
134
|
+
mark_len(i) > 0 && @s[i..i+1] !~ /\A°[CF]/
|
134
135
|
end
|
135
136
|
|
136
|
-
def mark_len
|
137
|
-
|
138
|
-
while @s[
|
139
|
-
|
137
|
+
def mark_len(i = @i)
|
138
|
+
j = i
|
139
|
+
while @s[j] && MARK_CHARS.include?(@s[j])
|
140
|
+
j += 1
|
140
141
|
end
|
141
|
-
|
142
|
+
j - i
|
142
143
|
end
|
143
144
|
|
144
145
|
def abbrev_len
|
@@ -162,8 +163,8 @@ module FoodIngredientParser::Loose
|
|
162
163
|
end
|
163
164
|
|
164
165
|
def add_child
|
166
|
+
name_until_here
|
165
167
|
cur.ends(@i-1)
|
166
|
-
cur.name ||= Node.new(@s, cur.interval)
|
167
168
|
parent.send(@dest) << cur
|
168
169
|
@cur = nil
|
169
170
|
end
|
@@ -192,7 +193,11 @@ module FoodIngredientParser::Loose
|
|
192
193
|
end
|
193
194
|
|
194
195
|
def name_until_here
|
195
|
-
cur.name ||=
|
196
|
+
cur.name ||= begin
|
197
|
+
i, j = cur.interval.first, @i - 1
|
198
|
+
i += mark_len(i) # skip any mark in front
|
199
|
+
Node.new(@s, i .. j) if j > i
|
200
|
+
end
|
196
201
|
end
|
197
202
|
|
198
203
|
def dot_is_not_sep?
|
@@ -44,7 +44,7 @@ module FoodIngredientParser::Strict::Grammar
|
|
44
44
|
end
|
45
45
|
|
46
46
|
rule word
|
47
|
-
abbrev / char+
|
47
|
+
abbrev / word_complex / char+
|
48
48
|
end
|
49
49
|
|
50
50
|
rule and
|
@@ -104,13 +104,20 @@ module FoodIngredientParser::Strict::Grammar
|
|
104
104
|
'w.a'i /
|
105
105
|
'w.o'i /
|
106
106
|
'w.v'i /
|
107
|
-
#
|
108
|
-
'vit
|
109
|
-
'denat
|
110
|
-
'N°'i /
|
111
|
-
'°C'i
|
107
|
+
# not auto-generated additions
|
108
|
+
'vit'i /
|
109
|
+
'denat'i
|
112
110
|
)
|
113
111
|
'.'? ![[:alpha:]]
|
114
112
|
end
|
113
|
+
|
114
|
+
rule word_complex
|
115
|
+
# Complex words that contain characters that would otherwise be considered non-words.
|
116
|
+
(
|
117
|
+
'N°'i /
|
118
|
+
'°C'i /
|
119
|
+
( 'ijzer'i / 'chroom'i / 'koper'i ) ws* '(' 'I'i+ ')' ws* [[:alnum:]]+
|
120
|
+
) ![[:alpha:]]
|
121
|
+
end
|
115
122
|
end
|
116
123
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: food_ingredient_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0.pre.9
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- wvengen
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-09-
|
11
|
+
date: 2018-09-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: treetop
|
@@ -81,9 +81,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
81
81
|
version: '0'
|
82
82
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
83
83
|
requirements:
|
84
|
-
- - "
|
84
|
+
- - ">="
|
85
85
|
- !ruby/object:Gem::Version
|
86
|
-
version:
|
86
|
+
version: '0'
|
87
87
|
requirements: []
|
88
88
|
rubyforge_project:
|
89
89
|
rubygems_version: 2.6.13
|