food_ingredient_parser 1.1.6 → 1.1.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/food_ingredient_parser/cleaner.rb +8 -8
- data/lib/food_ingredient_parser/loose/scanner.rb +1 -1
- data/lib/food_ingredient_parser/strict/grammar/amount.treetop +11 -5
- data/lib/food_ingredient_parser/strict/grammar/common.treetop +21 -4
- data/lib/food_ingredient_parser/version.rb +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 59825ee90990b2c4f52c9e59fae2e34e5b4558bc63a57fc59946db7f71335351
|
4
|
+
data.tar.gz: b8201945554a11fddbac8eb6676c4cbff8c1f5d3523780ec54125e3172fd05ac
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3b97c863f9da5b26162883a3627809857fa9277e60e76f2c805312ceea34ad40317bcdf6ca2f1c56b2eafc4a5c637bbe0b5ac3c3c24ef8f874dd008bb5cc3bd7
|
7
|
+
data.tar.gz: bfcc88aea38c3db84670e84dd1166cbcdf8ce60edff297a2ed7324616ea53a4625ab137423f2f6701981a27a51b398ab652b207abe435328e4fa67974bad3f01
|
@@ -2,14 +2,14 @@ module FoodIngredientParser
|
|
2
2
|
module Cleaner
|
3
3
|
|
4
4
|
def self.clean(s)
|
5
|
-
s.gsub!(
|
6
|
-
s.gsub!("\
|
7
|
-
s.gsub!("
|
8
|
-
s.gsub!("
|
9
|
-
s.gsub!("
|
10
|
-
s.gsub!("
|
11
|
-
s.gsub!(
|
12
|
-
s.gsub!(/\A\s*'(.*)
|
5
|
+
s.gsub!(/(_x005f_|_)x000d_/i, "\n") # fix newline that is sometimes encoded as _x000d_ or _x005f_x000d_
|
6
|
+
s.gsub!("\u00ad", "") # strip soft hyphen
|
7
|
+
s.gsub!("\u0092", "'") # windows-1252 apostrophe - https://stackoverflow.com/a/15564279/2866660
|
8
|
+
s.gsub!("‚", ",") # normalize unicode comma
|
9
|
+
s.gsub!("aÄs", "aïs") # encoding issue for maïs
|
10
|
+
s.gsub!("ï", "ï") # encoding issue
|
11
|
+
s.gsub!("ë", "ë") # encoding issue
|
12
|
+
s.gsub!(/\A\s*(["']+)(.*)\1\s*\z/, '\2') # enclosing quotation marks
|
13
13
|
s
|
14
14
|
end
|
15
15
|
|
@@ -4,7 +4,7 @@ module FoodIngredientParser::Loose
|
|
4
4
|
class Scanner
|
5
5
|
|
6
6
|
SEP_CHARS = "|;,.".freeze
|
7
|
-
MARK_CHARS = "
|
7
|
+
MARK_CHARS = "¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº⁽⁾†‡⁺•°▪◊#^˄*~".freeze
|
8
8
|
PREFIX_RE = /\A\s*(ingredients|contains|ingred[iï][eë]nt(en)?(declaratie)?|bevat|dit zit er\s?in|samenstelling|zutaten)\b\s*[:;.]?\s*/i.freeze
|
9
9
|
NOTE_RE = /\A\b(dit product kan\b|deze verpakking kan\b|kan sporen\b.*?\bbevatten\b|voor allergenen\b|allergenen\b|allergie[- ]informatie(\s*:|\b)|E\s*=|gemaakt in\b|geproduceerd in\b|bevat mogelijk\b|kijk voor meer\b|allergie-info|in de fabriek\b|in dit bedrijf\b|voor [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bis [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bgebruikt\b)/i.freeze
|
10
10
|
# Keep in sync with +abbrev+ in the +Common+ grammar, plus relevant ones from the +Amount+ grammar.
|
@@ -21,22 +21,28 @@ module FoodIngredientParser::Strict::Grammar
|
|
21
21
|
) ws* )?
|
22
22
|
amount_simple_quantity
|
23
23
|
( ws+ (
|
24
|
-
'of'i / 'or less of'i / 'or more of'i /
|
25
|
-
'minimaal'i / 'minimum'i / 'van het uitlekgewicht'i / 'van het geheel'i /
|
24
|
+
'of a'i / 'of'i / 'or less of'i / 'or more of'i /
|
25
|
+
'van een'i / 'minimaal'i / 'minimum'i / 'van het uitlekgewicht'i / 'van het geheel'i /
|
26
26
|
'min.'i / 'min'i / 'max.'i / 'max'i
|
27
27
|
) )?
|
28
28
|
end
|
29
29
|
|
30
30
|
rule amount_simple_quantity
|
31
|
-
amount_simple_number ( ws* amount_simple_unit? ws* dash ws* amount_simple_number )? ws* amount_simple_unit
|
31
|
+
amount_simple_number ( ws* amount_simple_unit? ws* dash ws* amount_simple_number )? ( ws* amount_simple_unit )?
|
32
32
|
end
|
33
33
|
|
34
34
|
rule amount_simple_number
|
35
|
-
(
|
35
|
+
( amount_simple_comparator ws* )? number
|
36
|
+
end
|
37
|
+
|
38
|
+
rule amount_simple_comparator
|
39
|
+
'=' ws* [<>] /
|
40
|
+
[<>] ws* ( '=' / 'of gelijk aan'i !char / 'or equal to'i !char ) /
|
41
|
+
[±∓~∼∽≂≃≈≲≤<>≥≳] / '+/-' / '-/+'
|
36
42
|
end
|
37
43
|
|
38
44
|
rule amount_simple_unit
|
39
|
-
( percent / ( ( 'procent' / 'percent' / 'gram'i / 'ml'i / 'mg'i / 'gr'i / 'g'i ) !char ) )
|
45
|
+
( percent / ( ( 'procent' / 'percent' / 'gram'i / 'ml'i / 'mg'i / 'gr'i / 'g'i / 'ppm'i ) !char ) )
|
40
46
|
( ws 'vol'i ( !char / '.' ) )?
|
41
47
|
( ws* '℮' )?
|
42
48
|
end
|
@@ -23,9 +23,9 @@ module FoodIngredientParser::Strict::Grammar
|
|
23
23
|
[¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº] '⁾'? /
|
24
24
|
'⁽' [¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº] '⁾' /
|
25
25
|
[˄^] digit /
|
26
|
-
[
|
26
|
+
[†‡⁺•°▪◊#˄^~˛] /
|
27
27
|
'*'+ /
|
28
|
-
'(' ws* ( [
|
28
|
+
'(' ws* ( [†‡⁺•°▪◊#˄^~˛] / '*'+ ) ws* ')'
|
29
29
|
end
|
30
30
|
|
31
31
|
rule digit
|
@@ -33,7 +33,8 @@ module FoodIngredientParser::Strict::Grammar
|
|
33
33
|
end
|
34
34
|
|
35
35
|
rule fraction
|
36
|
-
[½⅓⅔¼¾⅕⅖⅗⅘⅙⅚⅐⅛⅜⅝⅞⅑⅒]
|
36
|
+
[½⅓⅔¼¾⅕⅖⅗⅘⅙⅚⅐⅛⅜⅝⅞⅑⅒] /
|
37
|
+
digit+ '/' digit+
|
37
38
|
end
|
38
39
|
|
39
40
|
rule percent
|
@@ -61,6 +62,20 @@ module FoodIngredientParser::Strict::Grammar
|
|
61
62
|
![[:alnum:]] / ( ws* '(' 'i'i+ ')' ) # e.g. "E450 (iii)"
|
62
63
|
end
|
63
64
|
|
65
|
+
rule chem_systematic_name
|
66
|
+
( chem_systematic_name_num dash ) ( chem_systematic_name_word dash chem_systematic_name_num dash ws? )* chem_systematic_name_word /
|
67
|
+
( chem_systematic_name_word dash chem_systematic_name_num dash ws? )+ chem_systematic_name_word
|
68
|
+
end
|
69
|
+
|
70
|
+
rule chem_systematic_name_word
|
71
|
+
[A-Za-z]+ ( dash [A-Za-z]+ dash [A-Za-z]+ )*
|
72
|
+
end
|
73
|
+
|
74
|
+
rule chem_systematic_name_num
|
75
|
+
digit+ [RH] /
|
76
|
+
digit+ ( ',' digit+ )* '\''?
|
77
|
+
end
|
78
|
+
|
64
79
|
rule abbrev
|
65
80
|
# These are listed explicitly to avoid incorrect interpretations, and allow missing trailing dots.
|
66
81
|
# To get an idea of what occurs (second one omits trailing dots):
|
@@ -115,6 +130,7 @@ module FoodIngredientParser::Strict::Grammar
|
|
115
130
|
'w.o'i /
|
116
131
|
'w.v'i /
|
117
132
|
# not auto-generated additions
|
133
|
+
'nr.'i /
|
118
134
|
'vit'i / # vitamin
|
119
135
|
'denat'i / # denatured
|
120
136
|
'alc'i / # alcohol
|
@@ -138,7 +154,8 @@ module FoodIngredientParser::Strict::Grammar
|
|
138
154
|
'T.' ws* 'aestivum'i (ws+ 'vitt.'i)? /
|
139
155
|
'nucifera' ws+ 'L.'i /
|
140
156
|
'type'i ws+ '"' [0-9]+ '"' /
|
141
|
-
e_number
|
157
|
+
e_number /
|
158
|
+
chem_systematic_name
|
142
159
|
) ![[:alpha:]]
|
143
160
|
end
|
144
161
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: food_ingredient_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.
|
4
|
+
version: 1.1.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- wvengen
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-12-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: treetop
|