food_ingredient_parser 1.1.3 → 1.1.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/bin/food_ingredient_parser +7 -2
- data/lib/food_ingredient_parser/cleaner.rb +8 -7
- data/lib/food_ingredient_parser/loose/scanner.rb +9 -5
- data/lib/food_ingredient_parser/loose/transform/amount_from_name.treetop +3 -2
- data/lib/food_ingredient_parser/strict/grammar/amount.treetop +16 -5
- data/lib/food_ingredient_parser/strict/grammar/common.treetop +40 -9
- data/lib/food_ingredient_parser/strict/grammar/ingredient_simple.treetop +3 -2
- data/lib/food_ingredient_parser/strict/grammar/root.treetop +1 -1
- data/lib/food_ingredient_parser/version.rb +2 -2
- metadata +3 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: a509e86303484d262ce6fd2d52b4db47d661e45ccbb7691e479e5257bd6c17a2
|
4
|
+
data.tar.gz: a1f50b40110cc4ba87e54ee410a3016d89796277882ceec3696a1e70d081c5f3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 62424433ad156e03a9448e5250459bb9409e8111a41fd9e02db823fd2e5f26cee9954476c0d177eb0375fa5405dd1c4a968d652ae63ff9abf97f3599fc950a4f
|
7
|
+
data.tar.gz: 06a05663e658643253184589f621c4af2d12c51bf8825429d56480be187fb2ff2d07c956e5d8520e4cce377b1894c23d52dab7ba0644e1614276e7b3e87e0ca6
|
data/bin/food_ingredient_parser
CHANGED
@@ -41,8 +41,10 @@ def parse_single(s, parsed=nil, parser:, verbosity: 1, print: nil, escape: false
|
|
41
41
|
if parsed
|
42
42
|
puts(parsed.inspect) if verbosity > 1
|
43
43
|
pp(parsed.to_h, color: color) if verbosity > 0
|
44
|
+
return true
|
44
45
|
else
|
45
46
|
puts "(no result: #{parser.parser.failure_reason})" if verbosity > 0
|
47
|
+
return false
|
46
48
|
end
|
47
49
|
end
|
48
50
|
|
@@ -63,6 +65,7 @@ def parse_file(path, parser:, verbosity: 1, print: nil, escape: false, color: fa
|
|
63
65
|
pct_parsed = 100.0 * count_parsed / (count_parsed + count_noresult)
|
64
66
|
pct_noresult = 100.0 * count_noresult / (count_parsed + count_noresult)
|
65
67
|
puts "parsed #{colorize(color && "1;32", count_parsed)} (#{pct_parsed.round(1)}%), no result #{colorize(color && "1;31", count_noresult)} (#{pct_noresult.round(1)}%)"
|
68
|
+
return count_noresult
|
66
69
|
end
|
67
70
|
|
68
71
|
verbosity = 1
|
@@ -108,8 +111,10 @@ if strings.any? || files.any?
|
|
108
111
|
STDERR.puts("Please specify one of the known parsers: #{PARSERS.keys.join(", ")}.")
|
109
112
|
exit(1)
|
110
113
|
end
|
111
|
-
|
112
|
-
|
114
|
+
success = true
|
115
|
+
strings.each {|s| success &= parse_single(s, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color) }
|
116
|
+
files.each {|f| success &= parse_file(f, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color) == 0 }
|
117
|
+
success or exit(1)
|
113
118
|
else
|
114
119
|
STDERR.puts("Please specify one or more --file or --string arguments (see --help).")
|
115
120
|
end
|
@@ -2,13 +2,14 @@ module FoodIngredientParser
|
|
2
2
|
module Cleaner
|
3
3
|
|
4
4
|
def self.clean(s)
|
5
|
-
s.gsub!(
|
6
|
-
s.gsub!("\
|
7
|
-
s.gsub!("
|
8
|
-
s.gsub!("
|
9
|
-
s.gsub!("
|
10
|
-
s.gsub!(
|
11
|
-
s.gsub!(
|
5
|
+
s.gsub!(/(_x005f_|_)x000d_/i, "\n") # fix an encoding sometimes seen for newline
|
6
|
+
s.gsub!("\u00ad", "") # strip soft hyphen
|
7
|
+
s.gsub!("\u0092", "'") # windows-1252 apostrophe - https://stackoverflow.com/a/15564279/2866660
|
8
|
+
s.gsub!("‚", ",") # normalize unicode comma
|
9
|
+
s.gsub!("aÄs", "aïs") # encoding issue for maïs
|
10
|
+
s.gsub!("ï", "ï") # encoding issue
|
11
|
+
s.gsub!("ë", "ë") # encoding issue
|
12
|
+
s.gsub!(/\A\s*(["']+)(.*)\1\s*\z/, '\2') # enclosing quotation marks
|
12
13
|
s
|
13
14
|
end
|
14
15
|
|
@@ -4,19 +4,23 @@ module FoodIngredientParser::Loose
|
|
4
4
|
class Scanner
|
5
5
|
|
6
6
|
SEP_CHARS = "|;,.".freeze
|
7
|
-
MARK_CHARS = "
|
7
|
+
MARK_CHARS = "¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº⁽⁾†‡⁺•°▪◊#^˄*~".freeze
|
8
8
|
PREFIX_RE = /\A\s*(ingredients|contains|ingred[iï][eë]nt(en)?(declaratie)?|bevat|dit zit er\s?in|samenstelling|zutaten)\b\s*[:;.]?\s*/i.freeze
|
9
|
-
NOTE_RE = /\A\b(dit product kan\b|deze verpakking kan\b|kan sporen\b.*?\bbevatten\b|voor allergenen\b|allergenen\b|E\s*=|gemaakt in\b|geproduceerd in\b|bevat mogelijk\b|kijk voor meer\b|allergie-info|in de fabriek\b|in dit bedrijf\b|voor [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bis [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bgebruikt\b)/i.freeze
|
9
|
+
NOTE_RE = /\A\b(dit product kan\b|deze verpakking kan\b|kan sporen\b.*?\bbevatten\b|voor allergenen\b|allergenen\b|allergie[- ]informatie(\s*:|\b)|E\s*=|gemaakt in\b|geproduceerd in\b|bevat mogelijk\b|kijk voor meer\b|allergie-info|in de fabriek\b|in dit bedrijf\b|voor [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bis [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bgebruikt\b)/i.freeze
|
10
10
|
# Keep in sync with +abbrev+ in the +Common+ grammar, plus relevant ones from the +Amount+ grammar.
|
11
11
|
ABBREV_RE = Regexp.union(
|
12
12
|
/\A(
|
13
13
|
N°\b |
|
14
14
|
°C\b |
|
15
15
|
(ijzer|chroom|koper)\s*\(I+\)\s*[[:alnum:]]+\b |
|
16
|
-
L\(\+\)-[[:alnum:]]+\b |
|
16
|
+
L\(\+\)[ -][[:alnum:]]+\b |
|
17
|
+
L\.\s+rhamnosus\b | L\.\s+acidophilus\b | L\.\s+casei\b | B\.\s+lactis | A\.\s+oryzae |
|
18
|
+
S\.\s+thermophilus\b | L\.\sbulgaricus\b |
|
19
|
+
T\.\s*aestivum\b(\s+vitt\.)? |
|
20
|
+
nucifera\s+L\. |
|
17
21
|
type\s+"\d+" |
|
18
|
-
|
19
|
-
|
22
|
+
E-e?\d{3}[a-z]?\s*\(i+\) |
|
23
|
+
www\.[-_\/:%.A-Za-z0-9]+
|
20
24
|
)/xi,
|
21
25
|
*%w[
|
22
26
|
a.o.p b.g.a b.o.b c.a c.i d.e d.m.v d.o.c d.o.p d.s e.a e.g e.u f.i.l f.o.s i.a
|
@@ -5,8 +5,9 @@ module FoodIngredientParser::Loose::Transform
|
|
5
5
|
|
6
6
|
rule amount_from_name
|
7
7
|
# just amount, amount in front or at the end
|
8
|
-
ws* amount:amount
|
9
|
-
ws* amount:
|
8
|
+
ws* amount:amount ws+ name:(.*) /
|
9
|
+
ws* amount:amount_simple_percent ws* name:(.*) /
|
10
|
+
ws* amount:amount ws* /
|
10
11
|
ws* name:( !amount word ( ws+ !amount word )* )+ ws* amount:amount ws*
|
11
12
|
end
|
12
13
|
end
|
@@ -9,6 +9,10 @@ module FoodIngredientParser::Strict::Grammar
|
|
9
9
|
amount:amount_simple <AmountNode>
|
10
10
|
end
|
11
11
|
|
12
|
+
rule amount_simple_percent
|
13
|
+
amount:(amount_simple_number ws* percent) <AmountNode>
|
14
|
+
end
|
15
|
+
|
12
16
|
rule amount_simple
|
13
17
|
( (
|
14
18
|
'of which'i / 'at least'i / 'minimal'i / 'maximal'i / 'less than'i / 'more than'i /
|
@@ -17,23 +21,30 @@ module FoodIngredientParser::Strict::Grammar
|
|
17
21
|
) ws* )?
|
18
22
|
amount_simple_quantity
|
19
23
|
( ws+ (
|
20
|
-
'of'i / 'or less of'i / 'or more of'i /
|
21
|
-
'minimaal'i / 'minimum'i / 'van het uitlekgewicht'i / 'van het geheel'i /
|
24
|
+
'of a'i / 'of'i / 'or less of'i / 'or more of'i /
|
25
|
+
'van een'i / 'minimaal'i / 'minimum'i / 'van het uitlekgewicht'i / 'van het geheel'i /
|
22
26
|
'min.'i / 'min'i / 'max.'i / 'max'i
|
23
27
|
) )?
|
24
28
|
end
|
25
29
|
|
26
30
|
rule amount_simple_quantity
|
27
|
-
amount_simple_number ( ws* amount_simple_unit? ws* dash ws* amount_simple_number )? ws* amount_simple_unit
|
31
|
+
amount_simple_number ( ws* amount_simple_unit? ws* dash ws* amount_simple_number )? ( ws* amount_simple_unit )?
|
28
32
|
end
|
29
33
|
|
30
34
|
rule amount_simple_number
|
31
|
-
(
|
35
|
+
( amount_simple_comparator ws* )? number
|
36
|
+
end
|
37
|
+
|
38
|
+
rule amount_simple_comparator
|
39
|
+
'=' ws* [<>] /
|
40
|
+
[<>] ws* ( '=' / 'of gelijk aan'i !char / 'or equal to'i !char ) /
|
41
|
+
[±∓~∼∽≂≃≈≲≤<>≥≳] / '+/-' / '-/+'
|
32
42
|
end
|
33
43
|
|
34
44
|
rule amount_simple_unit
|
35
|
-
( percent / ( ( 'procent' / 'percent' / 'gram'i / 'ml'i / 'mg'i / 'gr'i / 'g'i ) !char ) )
|
45
|
+
( percent / ( ( 'procent' / 'percent' / 'gram'i / 'ml'i / 'mg'i / 'gr'i / 'g'i / 'ppm'i ) !char ) )
|
36
46
|
( ws 'vol'i ( !char / '.' ) )?
|
47
|
+
( ws* '℮' )?
|
37
48
|
end
|
38
49
|
end
|
39
50
|
end
|
@@ -10,17 +10,22 @@ module FoodIngredientParser::Strict::Grammar
|
|
10
10
|
end
|
11
11
|
|
12
12
|
rule char
|
13
|
-
[[:alnum:]] /
|
13
|
+
!mark [[:alnum:]] /
|
14
14
|
fraction /
|
15
|
-
[-/\`'
|
16
|
-
[
|
17
|
-
[
|
15
|
+
[-/\`'"´‘’+=_{}&] /
|
16
|
+
[®©™♣] /
|
17
|
+
[¿?¯] / # weird characters turning up in names (e.g. encoding issues)
|
18
18
|
[₁₂₃₄₅₆₇₈₉] # can occur with vitamins
|
19
19
|
end
|
20
20
|
|
21
21
|
rule mark
|
22
22
|
# mark referencing a footnote
|
23
|
-
[¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº] '⁾'? /
|
23
|
+
[¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº] '⁾'? /
|
24
|
+
'⁽' [¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº] '⁾' /
|
25
|
+
[˄^] digit /
|
26
|
+
[†‡⁺•°▪◊#˄^~˛] /
|
27
|
+
'*'+ /
|
28
|
+
'(' ws* ( [†‡⁺•°▪◊#˄^~˛] / '*'+ ) ws* ')'
|
24
29
|
end
|
25
30
|
|
26
31
|
rule digit
|
@@ -28,7 +33,8 @@ module FoodIngredientParser::Strict::Grammar
|
|
28
33
|
end
|
29
34
|
|
30
35
|
rule fraction
|
31
|
-
[½⅓⅔¼¾⅕⅖⅗⅘⅙⅚⅐⅛⅜⅝⅞⅑⅒]
|
36
|
+
[½⅓⅔¼¾⅕⅖⅗⅘⅙⅚⅐⅛⅜⅝⅞⅑⅒] /
|
37
|
+
digit+ '/' digit+
|
32
38
|
end
|
33
39
|
|
34
40
|
rule percent
|
@@ -51,6 +57,25 @@ module FoodIngredientParser::Strict::Grammar
|
|
51
57
|
( 'and' / 'en' / 'und' ) !char / '&'
|
52
58
|
end
|
53
59
|
|
60
|
+
rule e_number
|
61
|
+
( 'E'i '-'? [0-9] [0-9] [0-9] [[:alpha:]]? )
|
62
|
+
( ( ws* '(' 'i'i+ ')' ) / ![[:alnum:]] ) # e.g. "E450 (iii)"
|
63
|
+
end
|
64
|
+
|
65
|
+
rule chem_systematic_name
|
66
|
+
( chem_systematic_name_num dash ) ( chem_systematic_name_word dash chem_systematic_name_num dash ws? )* chem_systematic_name_word /
|
67
|
+
( chem_systematic_name_word dash chem_systematic_name_num dash ws? )+ chem_systematic_name_word
|
68
|
+
end
|
69
|
+
|
70
|
+
rule chem_systematic_name_word
|
71
|
+
[A-Za-z]+ ( dash [A-Za-z]+ dash [A-Za-z]+ )*
|
72
|
+
end
|
73
|
+
|
74
|
+
rule chem_systematic_name_num
|
75
|
+
digit+ [RH] /
|
76
|
+
digit+ ( ',' digit+ )* '\''?
|
77
|
+
end
|
78
|
+
|
54
79
|
rule abbrev
|
55
80
|
# These are listed explicitly to avoid incorrect interpretations, and to allow missing trailing dots.
|
56
81
|
# To get an idea of what occurs (second one omits trailing dots):
|
@@ -105,12 +130,14 @@ module FoodIngredientParser::Strict::Grammar
|
|
105
130
|
'w.o'i /
|
106
131
|
'w.v'i /
|
107
132
|
# not auto-generated additions
|
133
|
+
'nr.'i /
|
108
134
|
'vit'i / # vitamin
|
109
135
|
'denat'i / # denatured
|
110
136
|
'alc'i / # alcohol
|
111
137
|
'vol'i / # volume
|
112
138
|
'conc'i / # concentration
|
113
|
-
'subsp'i
|
139
|
+
'subsp'i / # subspecies
|
140
|
+
'www.'i [-_\/:%.A-Za-z0-9]+
|
114
141
|
)
|
115
142
|
'.'? ![[:alpha:]]
|
116
143
|
end
|
@@ -121,10 +148,14 @@ module FoodIngredientParser::Strict::Grammar
|
|
121
148
|
'N°'i /
|
122
149
|
'°C'i /
|
123
150
|
( 'ijzer'i / 'chroom'i / 'koper'i ) ws* '(' 'I'i+ ')' ws* [[:alnum:]]+ /
|
124
|
-
'L(+)-' [[:alnum:]]+ /
|
151
|
+
'L(+)' ('-' / ws) [[:alnum:]]+ /
|
125
152
|
'L.' ws+ 'rhamnosus'i / 'L.' ws+ 'acidophilus'i / 'L.' ws+ 'casei' / 'B.'i ws+ 'lactis'i / 'A.'i ws+ 'oryzae'i /
|
153
|
+
'S.' ws+ 'thermophilus'i / 'L.' ws+ 'bulgaricus'i /
|
154
|
+
'T.' ws* 'aestivum'i (ws+ 'vitt.'i)? /
|
155
|
+
'nucifera' ws+ 'L.'i /
|
126
156
|
'type'i ws+ '"' [0-9]+ '"' /
|
127
|
-
|
157
|
+
e_number /
|
158
|
+
chem_systematic_name
|
128
159
|
) ![[:alpha:]]
|
129
160
|
end
|
130
161
|
end
|
@@ -9,13 +9,14 @@ module FoodIngredientParser::Strict::Grammar
|
|
9
9
|
end
|
10
10
|
|
11
11
|
rule ingredient_simple_with_amount
|
12
|
-
pre:( '{' ws* )? amount:amount
|
12
|
+
pre:( '{' ws* )? amount:amount ws+ ing:ingredient_simple <IngredientNode> /
|
13
|
+
pre:( '{' ws* )? amount:amount_simple_percent ws* ing:ingredient_simple <IngredientNode> /
|
13
14
|
ing:ingredient_simple ws* amount:amount post:( ws* '}' )? (ws? mark:mark)? <IngredientNode> /
|
14
15
|
ing:ingredient_simple <IngredientNode>
|
15
16
|
end
|
16
17
|
|
17
18
|
rule ingredient_simple_e_number
|
18
|
-
name:
|
19
|
+
name:e_number <IngredientNode>
|
19
20
|
end
|
20
21
|
|
21
22
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: food_ingredient_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.
|
4
|
+
version: 1.1.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- wvengen
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-01-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: treetop
|
@@ -87,8 +87,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
87
87
|
- !ruby/object:Gem::Version
|
88
88
|
version: '0'
|
89
89
|
requirements: []
|
90
|
-
|
91
|
-
rubygems_version: 2.6.13
|
90
|
+
rubygems_version: 3.0.3
|
92
91
|
signing_key:
|
93
92
|
specification_version: 4
|
94
93
|
summary: Parser for ingredient lists found on food products.
|