food_ingredient_parser 1.1.6 → 1.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/food_ingredient_parser/cleaner.rb +8 -8
- data/lib/food_ingredient_parser/loose/scanner.rb +1 -1
- data/lib/food_ingredient_parser/strict/grammar/amount.treetop +11 -5
- data/lib/food_ingredient_parser/strict/grammar/common.treetop +21 -4
- data/lib/food_ingredient_parser/version.rb +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 59825ee90990b2c4f52c9e59fae2e34e5b4558bc63a57fc59946db7f71335351
|
|
4
|
+
data.tar.gz: b8201945554a11fddbac8eb6676c4cbff8c1f5d3523780ec54125e3172fd05ac
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 3b97c863f9da5b26162883a3627809857fa9277e60e76f2c805312ceea34ad40317bcdf6ca2f1c56b2eafc4a5c637bbe0b5ac3c3c24ef8f874dd008bb5cc3bd7
|
|
7
|
+
data.tar.gz: bfcc88aea38c3db84670e84dd1166cbcdf8ce60edff297a2ed7324616ea53a4625ab137423f2f6701981a27a51b398ab652b207abe435328e4fa67974bad3f01
|
|
data/lib/food_ingredient_parser/cleaner.rb
CHANGED
|
@@ -2,14 +2,14 @@ module FoodIngredientParser
|
|
|
2
2
|
module Cleaner
|
|
3
3
|
|
|
4
4
|
def self.clean(s)
|
|
5
|
-
s.gsub!(
|
|
6
|
-
s.gsub!("\
|
|
7
|
-
s.gsub!("
|
|
8
|
-
s.gsub!("
|
|
9
|
-
s.gsub!("
|
|
10
|
-
s.gsub!("
|
|
11
|
-
s.gsub!(
|
|
12
|
-
s.gsub!(/\A\s*'(.*)
|
|
5
|
+
s.gsub!(/(_x005f_|_)x000d_/i, "\n") # replace the occasional literal "_x000d_" escape with a newline
|
|
6
|
+
s.gsub!("\u00ad", "") # strip soft hyphen
|
|
7
|
+
s.gsub!("\u0092", "'") # windows-1252 apostrophe - https://stackoverflow.com/a/15564279/2866660
|
|
8
|
+
s.gsub!("‚", ",") # normalize unicode comma
|
|
9
|
+
s.gsub!("aÄs", "aïs") # encoding issue for maïs
|
|
10
|
+
s.gsub!("ï", "ï") # encoding issue
|
|
11
|
+
s.gsub!("ë", "ë") # encoding issue
|
|
12
|
+
s.gsub!(/\A\s*(["']+)(.*)\1\s*\z/, '\2') # enclosing quotation marks
|
|
13
13
|
s
|
|
14
14
|
end
|
|
15
15
|
|
|
data/lib/food_ingredient_parser/loose/scanner.rb
CHANGED
|
@@ -4,7 +4,7 @@ module FoodIngredientParser::Loose
|
|
|
4
4
|
class Scanner
|
|
5
5
|
|
|
6
6
|
SEP_CHARS = "|;,.".freeze
|
|
7
|
-
MARK_CHARS = "
|
|
7
|
+
MARK_CHARS = "¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº⁽⁾†‡⁺•°▪◊#^˄*~".freeze
|
|
8
8
|
PREFIX_RE = /\A\s*(ingredients|contains|ingred[iï][eë]nt(en)?(declaratie)?|bevat|dit zit er\s?in|samenstelling|zutaten)\b\s*[:;.]?\s*/i.freeze
|
|
9
9
|
NOTE_RE = /\A\b(dit product kan\b|deze verpakking kan\b|kan sporen\b.*?\bbevatten\b|voor allergenen\b|allergenen\b|allergie[- ]informatie(\s*:|\b)|E\s*=|gemaakt in\b|geproduceerd in\b|bevat mogelijk\b|kijk voor meer\b|allergie-info|in de fabriek\b|in dit bedrijf\b|voor [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bis [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bgebruikt\b)/i.freeze
|
|
10
10
|
# Keep in sync with +abbrev+ in the +Common+ grammar, plus relevant ones from the +Amount+ grammar.
|
|
data/lib/food_ingredient_parser/strict/grammar/amount.treetop
CHANGED
|
@@ -21,22 +21,28 @@ module FoodIngredientParser::Strict::Grammar
|
|
|
21
21
|
) ws* )?
|
|
22
22
|
amount_simple_quantity
|
|
23
23
|
( ws+ (
|
|
24
|
-
'of'i / 'or less of'i / 'or more of'i /
|
|
25
|
-
'minimaal'i / 'minimum'i / 'van het uitlekgewicht'i / 'van het geheel'i /
|
|
24
|
+
'of a'i / 'of'i / 'or less of'i / 'or more of'i /
|
|
25
|
+
'van een'i / 'minimaal'i / 'minimum'i / 'van het uitlekgewicht'i / 'van het geheel'i /
|
|
26
26
|
'min.'i / 'min'i / 'max.'i / 'max'i
|
|
27
27
|
) )?
|
|
28
28
|
end
|
|
29
29
|
|
|
30
30
|
rule amount_simple_quantity
|
|
31
|
-
amount_simple_number ( ws* amount_simple_unit? ws* dash ws* amount_simple_number )? ws* amount_simple_unit
|
|
31
|
+
amount_simple_number ( ws* amount_simple_unit? ws* dash ws* amount_simple_number )? ( ws* amount_simple_unit )?
|
|
32
32
|
end
|
|
33
33
|
|
|
34
34
|
rule amount_simple_number
|
|
35
|
-
(
|
|
35
|
+
( amount_simple_comparator ws* )? number
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
rule amount_simple_comparator
|
|
39
|
+
'=' ws* [<>] /
|
|
40
|
+
[<>] ws* ( '=' / 'of gelijk aan'i !char / 'or equal to'i !char ) /
|
|
41
|
+
[±∓~∼∽≂≃≈≲≤<>≥≳] / '+/-' / '-/+'
|
|
36
42
|
end
|
|
37
43
|
|
|
38
44
|
rule amount_simple_unit
|
|
39
|
-
( percent / ( ( 'procent' / 'percent' / 'gram'i / 'ml'i / 'mg'i / 'gr'i / 'g'i ) !char ) )
|
|
45
|
+
( percent / ( ( 'procent' / 'percent' / 'gram'i / 'ml'i / 'mg'i / 'gr'i / 'g'i / 'ppm'i ) !char ) )
|
|
40
46
|
( ws 'vol'i ( !char / '.' ) )?
|
|
41
47
|
( ws* '℮' )?
|
|
42
48
|
end
|
|
data/lib/food_ingredient_parser/strict/grammar/common.treetop
CHANGED
|
@@ -23,9 +23,9 @@ module FoodIngredientParser::Strict::Grammar
|
|
|
23
23
|
[¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº] '⁾'? /
|
|
24
24
|
'⁽' [¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº] '⁾' /
|
|
25
25
|
[˄^] digit /
|
|
26
|
-
[
|
|
26
|
+
[†‡⁺•°▪◊#˄^~˛] /
|
|
27
27
|
'*'+ /
|
|
28
|
-
'(' ws* ( [
|
|
28
|
+
'(' ws* ( [†‡⁺•°▪◊#˄^~˛] / '*'+ ) ws* ')'
|
|
29
29
|
end
|
|
30
30
|
|
|
31
31
|
rule digit
|
|
@@ -33,7 +33,8 @@ module FoodIngredientParser::Strict::Grammar
|
|
|
33
33
|
end
|
|
34
34
|
|
|
35
35
|
rule fraction
|
|
36
|
-
[½⅓⅔¼¾⅕⅖⅗⅘⅙⅚⅐⅛⅜⅝⅞⅑⅒]
|
|
36
|
+
[½⅓⅔¼¾⅕⅖⅗⅘⅙⅚⅐⅛⅜⅝⅞⅑⅒] /
|
|
37
|
+
digit+ '/' digit+
|
|
37
38
|
end
|
|
38
39
|
|
|
39
40
|
rule percent
|
|
@@ -61,6 +62,20 @@ module FoodIngredientParser::Strict::Grammar
|
|
|
61
62
|
![[:alnum:]] / ( ws* '(' 'i'i+ ')' ) # e.g. "E450 (iii)"
|
|
62
63
|
end
|
|
63
64
|
|
|
65
|
+
rule chem_systematic_name
|
|
66
|
+
( chem_systematic_name_num dash ) ( chem_systematic_name_word dash chem_systematic_name_num dash ws? )* chem_systematic_name_word /
|
|
67
|
+
( chem_systematic_name_word dash chem_systematic_name_num dash ws? )+ chem_systematic_name_word
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
rule chem_systematic_name_word
|
|
71
|
+
[A-Za-z]+ ( dash [A-Za-z]+ dash [A-Za-z]+ )*
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
rule chem_systematic_name_num
|
|
75
|
+
digit+ [RH] /
|
|
76
|
+
digit+ ( ',' digit+ )* '\''?
|
|
77
|
+
end
|
|
78
|
+
|
|
64
79
|
rule abbrev
|
|
65
80
|
# These are listed explicitly to avoid incorrect interpretations, and allow missing trailing dots.
|
|
66
81
|
# To get an idea of what occurs (second one omits trailing dots):
|
|
@@ -115,6 +130,7 @@ module FoodIngredientParser::Strict::Grammar
|
|
|
115
130
|
'w.o'i /
|
|
116
131
|
'w.v'i /
|
|
117
132
|
# not auto-generated additions
|
|
133
|
+
'nr.'i /
|
|
118
134
|
'vit'i / # vitamin
|
|
119
135
|
'denat'i / # denaturated
|
|
120
136
|
'alc'i / # alcohol
|
|
@@ -138,7 +154,8 @@ module FoodIngredientParser::Strict::Grammar
|
|
|
138
154
|
'T.' ws* 'aestivum'i (ws+ 'vitt.'i)? /
|
|
139
155
|
'nucifera' ws+ 'L.'i /
|
|
140
156
|
'type'i ws+ '"' [0-9]+ '"' /
|
|
141
|
-
e_number
|
|
157
|
+
e_number /
|
|
158
|
+
chem_systematic_name
|
|
142
159
|
) ![[:alpha:]]
|
|
143
160
|
end
|
|
144
161
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: food_ingredient_parser
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.1.6
|
|
4
|
+
version: 1.1.7
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- wvengen
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2020-
|
|
11
|
+
date: 2020-12-28 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: treetop
|