food_ingredient_parser 1.1.2 → 1.1.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 64ad7a10a1480b520602113bbcdfc10ba1daf8b5
4
- data.tar.gz: 4068a9edbe1dca908228f38d2795ad63a5cbcf76
2
+ SHA256:
3
+ metadata.gz: 59825ee90990b2c4f52c9e59fae2e34e5b4558bc63a57fc59946db7f71335351
4
+ data.tar.gz: b8201945554a11fddbac8eb6676c4cbff8c1f5d3523780ec54125e3172fd05ac
5
5
  SHA512:
6
- metadata.gz: 73ce876757b08e1d2cf0b5126e8d024b3728260134c3c4f3fe49fee14793da77ecc48d286165dc0c86e8363f2eddf6081355ac26d38f524371b367f4aa3cee23
7
- data.tar.gz: befa97dc0fd4605cd2019a2cf7a39aa15d5dfa15acf118f10e9b28e104bbc9bd5925f28286e780acb142eb73a935dba26657f1c6fe9cb766e7f4f88310d5ce55
6
+ metadata.gz: 3b97c863f9da5b26162883a3627809857fa9277e60e76f2c805312ceea34ad40317bcdf6ca2f1c56b2eafc4a5c637bbe0b5ac3c3c24ef8f874dd008bb5cc3bd7
7
+ data.tar.gz: bfcc88aea38c3db84670e84dd1166cbcdf8ce60edff297a2ed7324616ea53a4625ab137423f2f6701981a27a51b398ab652b207abe435328e4fa67974bad3f01
data/README.md CHANGED
@@ -185,9 +185,23 @@ So if you only use the stable interface (`parse`, `to_h` and `to_html`), you can
185
185
  to e.g. `~> 1.0`. If you depend on more, lock your version against e.g. `~> 1.0.0` and test when you
186
186
  upgrade to `1.1`.
187
187
 
188
+ ## Languages
189
+
190
+ While most of the parsing is language-independent, some parts need knowledge about certain words
191
+ (like abbreviations and amount specifiers). The gem was developed with ingredient lists in Dutch (nl),
192
+ plus a bit of English and German. Support for other languages is already good, but is lacking in certain
193
+ areas: improvements are welcome (starting with a corpus in [data/](data/)).
194
+
195
+ Many ingredient lists from the USA are structured a bit differently than those from Europe, so they
196
+ parse less well (though that's a matter of fine-tuning).
197
+
188
198
  ## Test data
189
199
 
190
- [`data/ingredient-samples-nl`](data/ingredient-samples-nl) contains about 150k
200
+ [`data/ingredient-samples-qm-nl`](data/ingredient-samples-qm-nl) contains about 150k
191
201
  real-world ingredient lists found on the Dutch market. Each line contains one ingredient
192
202
  list (newlines are encoded as `\n`, empty lines and those starting with `#` are ignored).
193
203
  The strict parser currently parses 80%, while the loose parser returns something for all of them.
204
+
205
+ ## License
206
+
207
+ This software is distributed under the [MIT license](LICENSE). Data may have a [different license](data/README.md).
@@ -41,8 +41,10 @@ def parse_single(s, parsed=nil, parser:, verbosity: 1, print: nil, escape: false
41
41
  if parsed
42
42
  puts(parsed.inspect) if verbosity > 1
43
43
  pp(parsed.to_h, color: color) if verbosity > 0
44
+ return true
44
45
  else
45
46
  puts "(no result: #{parser.parser.failure_reason})" if verbosity > 0
47
+ return false
46
48
  end
47
49
  end
48
50
 
@@ -63,6 +65,7 @@ def parse_file(path, parser:, verbosity: 1, print: nil, escape: false, color: fa
63
65
  pct_parsed = 100.0 * count_parsed / (count_parsed + count_noresult)
64
66
  pct_noresult = 100.0 * count_noresult / (count_parsed + count_noresult)
65
67
  puts "parsed #{colorize(color && "1;32", count_parsed)} (#{pct_parsed.round(1)}%), no result #{colorize(color && "1;31", count_noresult)} (#{pct_noresult.round(1)}%)"
68
+ return count_noresult
66
69
  end
67
70
 
68
71
  verbosity = 1
@@ -108,8 +111,10 @@ if strings.any? || files.any?
108
111
  STDERR.puts("Please specify one of the known parsers: #{PARSERS.keys.join(", ")}.")
109
112
  exit(1)
110
113
  end
111
- strings.each {|s| parse_single(s, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color) }
112
- files.each {|f| parse_file(f, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color) }
114
+ success = true
115
+ strings.each {|s| success &= parse_single(s, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color) }
116
+ files.each {|f| success &= parse_file(f, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color) == 0 }
117
+ success or exit(1)
113
118
  else
114
119
  STDERR.puts("Please specify one or more --file or --string arguments (see --help).")
115
120
  end
@@ -2,13 +2,14 @@ module FoodIngredientParser
2
2
  module Cleaner
3
3
 
4
4
  def self.clean(s)
5
- s.gsub!("\u00ad", "") # strip soft hyphen
6
- s.gsub!("\u0092", "'") # windows-1252 apostrophe - https://stackoverflow.com/a/15564279/2866660
7
- s.gsub!("aÄs", "aïs") # encoding issue for maïs
8
- s.gsub!("ï", "ï") # encoding issue
9
- s.gsub!("ë", "ë") # encoding issue
10
- s.gsub!(/\A\s*"(.*)"\s*\z/, '\1') # enclosing double quotation marks
11
- s.gsub!(/\A\s*'(.*)'\s*\z/, '\1') # enclosing single quotation marks
5
+ s.gsub!(/(_x005f_|_)x000d_/i, "\n") # fix sometimes encoding for newline
6
+ s.gsub!("\u00ad", "") # strip soft hyphen
7
+ s.gsub!("\u0092", "'") # windows-1252 apostrophe - https://stackoverflow.com/a/15564279/2866660
8
+ s.gsub!("", ",") # normalize unicode comma
9
+ s.gsub!("aÄs", "aïs") # encoding issue for maïs
10
+ s.gsub!("ï", "ï") # encoding issue
11
+ s.gsub!("ë", "ë") # encoding issue
12
+ s.gsub!(/\A\s*(["']+)(.*)\1\s*\z/, '\2') # enclosing quotation marks
12
13
  s
13
14
  end
14
15
 
@@ -4,16 +4,31 @@ module FoodIngredientParser::Loose
4
4
  class Scanner
5
5
 
6
6
  SEP_CHARS = "|;,.".freeze
7
- MARK_CHARS = "¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº⁽⁾†‡•°#^*".freeze
7
+ MARK_CHARS = "¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº⁽⁾†‡⁺•°▪◊#^˄*~".freeze
8
8
  PREFIX_RE = /\A\s*(ingredients|contains|ingred[iï][eë]nt(en)?(declaratie)?|bevat|dit zit er\s?in|samenstelling|zutaten)\b\s*[:;.]?\s*/i.freeze
9
- NOTE_RE = /\A\b(dit product kan\b|deze verpakking kan\b|kan sporen\b.*?\bbevatten\b|voor allergenen\b|allergenen\b|E\s*=|gemaakt in\b|geproduceerd in\b|bevat mogelijk\b|kijk voor meer\b|allergie-info|in de fabriek\b|in dit bedrijf\b|voor [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bis [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bgebruikt\b)/i.freeze
9
+ NOTE_RE = /\A\b(dit product kan\b|deze verpakking kan\b|kan sporen\b.*?\bbevatten\b|voor allergenen\b|allergenen\b|allergie[- ]informatie(\s*:|\b)|E\s*=|gemaakt in\b|geproduceerd in\b|bevat mogelijk\b|kijk voor meer\b|allergie-info|in de fabriek\b|in dit bedrijf\b|voor [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bis [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bgebruikt\b)/i.freeze
10
10
  # Keep in sync with +abbrev+ in the +Common+ grammar, plus relevant ones from the +Amount+ grammar.
11
- ABBREV_RE = Regexp.union(/\A(N°|°C|(ijzer|chroom|koper)\s*\(I+\)\s*[[:alnum:]]+)\b/i, *%w[
12
- a.o.p b.g.a b.o.b c.a c.i d.e d.m.v d.o.c d.o.p d.s e.a e.g e.u f.i.l f.o.s i.a
13
- i.d i.e i.g.m.e i.g.p i.m.v i.o i.v.m l.s.l n.a n.b n.o n.v.t o.a o.b.v p.d.o
14
- p.g.i q.s s.l s.s t.o.v u.h.t v.g v.s w.a w.o w.v vit denat
15
- min max ca
16
- ].map {|s| /\A#{Regexp.escape(s)}\b\.?/i}).freeze
11
+ ABBREV_RE = Regexp.union(
12
+ /\A(
13
+ N°\b |
14
+ °C\b |
15
+ (ijzer|chroom|koper)\s*\(I+\)\s*[[:alnum:]]+\b |
16
+ L\(\+\)[ -][[:alnum:]]+\b |
17
+ L\.\s+rhamnosus\b | L\.\s+acidophilus\b | L\.\s+casei\b | B\.\s+lactis | A\.\s+oryzae |
18
+ S\.\s+thermophilus\b | L\.\sbulgaricus\b |
19
+ T\.\s*aestivum\b(\s+vitt\.)? |
20
+ nucifera\s+L\. |
21
+ type\s+"\d+" |
22
+ E-e?\d{3}[a-z]?\s*\(i+\) |
23
+ www\.[-_\/:%.A-Za-z0-9]+
24
+ )/xi,
25
+ *%w[
26
+ a.o.p b.g.a b.o.b c.a c.i d.e d.m.v d.o.c d.o.p d.s e.a e.g e.u f.i.l f.o.s i.a
27
+ i.d i.e i.g.m.e i.g.p i.m.v i.o i.v.m l.s.l n.a n.b n.o n.v.t o.a o.b.v p.d.o
28
+ p.g.i q.s s.l s.s t.o.v u.h.t v.g v.s w.a w.o w.v vit denat alc vol conc subsp
29
+ min max ca
30
+ ].map {|s| /\A#{Regexp.escape(s)}\b\.?/i}
31
+ ).freeze
17
32
 
18
33
  def initialize(s, index: 0)
19
34
  @s = s # input string
@@ -5,8 +5,9 @@ module FoodIngredientParser::Loose::Transform
5
5
 
6
6
  rule amount_from_name
7
7
  # just amount, amount in front or at the end
8
- ws* amount:amount ws+ name:(.*) /
9
- ws* amount:amount ws* /
8
+ ws* amount:amount ws+ name:(.*) /
9
+ ws* amount:amount_simple_percent ws* name:(.*) /
10
+ ws* amount:amount ws* /
10
11
  ws* name:( !amount word ( ws+ !amount word )* )+ ws* amount:amount ws*
11
12
  end
12
13
  end
@@ -7,8 +7,9 @@ module FoodIngredientParser::Loose
7
7
  #
8
8
  # @note mark and amount is lost, this is not expected on e-numbers
9
9
 
10
- SPLIT_RE = /\s*-\s*/.freeze
11
- MATCH_RE = /\A\s*(e[0-9]{3}[a-z]?)(?:#{SPLIT_RE}(e[0-9]{3}[a-z]?))+\s*\z/i.freeze
10
+ SPLIT_RE = /\s*-\s*/.freeze
11
+ SINGLE_RE = /E-?\d{3}[a-z]?(?:\s*\(i+\))?/i.freeze
12
+ MATCH_RE = /\A\s*(#{SINGLE_RE})(?:#{SPLIT_RE}(#{SINGLE_RE}))+\s*\z/i.freeze
12
13
 
13
14
  def self.transform!(node)
14
15
  new(node).transform!
@@ -9,6 +9,10 @@ module FoodIngredientParser::Strict::Grammar
9
9
  amount:amount_simple <AmountNode>
10
10
  end
11
11
 
12
+ rule amount_simple_percent
13
+ amount:(amount_simple_number ws* percent) <AmountNode>
14
+ end
15
+
12
16
  rule amount_simple
13
17
  ( (
14
18
  'of which'i / 'at least'i / 'minimal'i / 'maximal'i / 'less than'i / 'more than'i /
@@ -17,21 +21,30 @@ module FoodIngredientParser::Strict::Grammar
17
21
  ) ws* )?
18
22
  amount_simple_quantity
19
23
  ( ws+ (
20
- 'minimaal'i / 'minimum'i / 'van het uitlekgewicht'i / 'van het geheel'i /
24
+ 'of a'i / 'of'i / 'or less of'i / 'or more of'i /
25
+ 'van een'i / 'minimaal'i / 'minimum'i / 'van het uitlekgewicht'i / 'van het geheel'i /
21
26
  'min.'i / 'min'i / 'max.'i / 'max'i
22
27
  ) )?
23
28
  end
24
29
 
25
30
  rule amount_simple_quantity
26
- amount_simple_number ( ws* amount_simple_unit? ws* dash ws* amount_simple_number )? ws* amount_simple_unit
31
+ amount_simple_number ( ws* amount_simple_unit? ws* dash ws* amount_simple_number )? ( ws* amount_simple_unit )?
27
32
  end
28
33
 
29
34
  rule amount_simple_number
30
- ( [±∓~∼∽≂≃≈≲≤<>≥≳] ws* )? number
35
+ ( amount_simple_comparator ws* )? number
36
+ end
37
+
38
+ rule amount_simple_comparator
39
+ '=' ws* [<>] /
40
+ [<>] ws* ( '=' / 'of gelijk aan'i !char / 'or equal to'i !char ) /
41
+ [±∓~∼∽≂≃≈≲≤<>≥≳] / '+/-' / '-/+'
31
42
  end
32
43
 
33
44
  rule amount_simple_unit
34
- ( percent / ( ( 'procent' / 'percent' / 'gram'i / 'ml'i / 'mg'i / 'gr'i / 'g'i ) !char ) )
45
+ ( percent / ( ( 'procent' / 'percent' / 'gram'i / 'ml'i / 'mg'i / 'gr'i / 'g'i / 'ppm'i ) !char ) )
46
+ ( ws 'vol'i ( !char / '.' ) )?
47
+ ( ws* '℮' )?
35
48
  end
36
49
  end
37
50
  end
@@ -10,17 +10,22 @@ module FoodIngredientParser::Strict::Grammar
10
10
  end
11
11
 
12
12
  rule char
13
- [[:alnum:]] /
13
+ !mark [[:alnum:]] /
14
14
  fraction /
15
- [-/\`'´’+=_{}&] /
16
- [®™] /
17
- [¿?] / # weird characters turning up in names (e.g. encoding issues)
15
+ [-/\`'"´‘’+=_{}&] /
16
+ [®©™♣] /
17
+ [¿?¯] / # weird characters turning up in names (e.g. encoding issues)
18
18
  [₁₂₃₄₅₆₇₈₉] # can occur with vitamins
19
19
  end
20
20
 
21
21
  rule mark
22
22
  # mark referencing a footnote
23
- [¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº] '⁾'? / '⁽' [¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº] '⁾' / [†‡•°#^] / '*'+ / '(' ws* ( [†‡•°#^] / '*'+ ) ws* ')'
23
+ [¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº] '⁾'? /
24
+ '⁽' [¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº] '⁾' /
25
+ [˄^] digit /
26
+ [†‡⁺•°▪◊#˄^~˛] /
27
+ '*'+ /
28
+ '(' ws* ( [†‡⁺•°▪◊#˄^~˛] / '*'+ ) ws* ')'
24
29
  end
25
30
 
26
31
  rule digit
@@ -28,7 +33,8 @@ module FoodIngredientParser::Strict::Grammar
28
33
  end
29
34
 
30
35
  rule fraction
31
- [½⅓⅔¼¾⅕⅖⅗⅘⅙⅚⅐⅛⅜⅝⅞⅑⅒]
36
+ [½⅓⅔¼¾⅕⅖⅗⅘⅙⅚⅐⅛⅜⅝⅞⅑⅒] /
37
+ digit+ '/' digit+
32
38
  end
33
39
 
34
40
  rule percent
@@ -48,7 +54,26 @@ module FoodIngredientParser::Strict::Grammar
48
54
  end
49
55
 
50
56
  rule and
51
- ( 'and' / 'en' / 'und' / '&' ) !char
57
+ ( 'and' / 'en' / 'und' ) !char / '&'
58
+ end
59
+
60
+ rule e_number
61
+ ( 'E'i '-'? [0-9] [0-9] [0-9] [[:alpha:]]? )
62
+ ![[:alnum:]] / ( ws* '(' 'i'i+ ')' ) # e.g. "E450 (iii)"
63
+ end
64
+
65
+ rule chem_systematic_name
66
+ ( chem_systematic_name_num dash ) ( chem_systematic_name_word dash chem_systematic_name_num dash ws? )* chem_systematic_name_word /
67
+ ( chem_systematic_name_word dash chem_systematic_name_num dash ws? )+ chem_systematic_name_word
68
+ end
69
+
70
+ rule chem_systematic_name_word
71
+ [A-Za-z]+ ( dash [A-Za-z]+ dash [A-Za-z]+ )*
72
+ end
73
+
74
+ rule chem_systematic_name_num
75
+ digit+ [RH] /
76
+ digit+ ( ',' digit+ )* '\''?
52
77
  end
53
78
 
54
79
  rule abbrev
@@ -105,8 +130,14 @@ module FoodIngredientParser::Strict::Grammar
105
130
  'w.o'i /
106
131
  'w.v'i /
107
132
  # not auto-generated additions
108
- 'vit'i /
109
- 'denat'i
133
+ 'nr.'i /
134
+ 'vit'i / # vitamin
135
+ 'denat'i / # denaturated
136
+ 'alc'i / # alcohol
137
+ 'vol'i / # volume
138
+ 'conc'i / # concentration
139
+ 'subsp'i / # subspecies
140
+ 'www.'i [-_\/:%.A-Za-z0-9]+
110
141
  )
111
142
  '.'? ![[:alpha:]]
112
143
  end
@@ -116,7 +147,15 @@ module FoodIngredientParser::Strict::Grammar
116
147
  (
117
148
  'N°'i /
118
149
  '°C'i /
119
- ( 'ijzer'i / 'chroom'i / 'koper'i ) ws* '(' 'I'i+ ')' ws* [[:alnum:]]+
150
+ ( 'ijzer'i / 'chroom'i / 'koper'i ) ws* '(' 'I'i+ ')' ws* [[:alnum:]]+ /
151
+ 'L(+)' ('-' / ws) [[:alnum:]]+ /
152
+ 'L.' ws+ 'rhamnosus'i / 'L.' ws+ 'acidophilus'i / 'L.' ws+ 'casei' / 'B.'i ws+ 'lactis'i / 'A.'i ws+ 'oryzae'i /
153
+ 'S.' ws+ 'thermophilus'i / 'L.' ws+ 'bulgaricus'i /
154
+ 'T.' ws* 'aestivum'i (ws+ 'vitt.'i)? /
155
+ 'nucifera' ws+ 'L.'i /
156
+ 'type'i ws+ '"' [0-9]+ '"' /
157
+ e_number /
158
+ chem_systematic_name
120
159
  ) ![[:alpha:]]
121
160
  end
122
161
  end
@@ -21,7 +21,7 @@ module FoodIngredientParser::Strict::Grammar
21
21
 
22
22
  rule ingredient_nested_contains
23
23
  'contains'i /
24
- 'bevat'i
24
+ 'bevat'i / 'bevat o.a.'i / 'o.a.'i / 'met'i
25
25
  end
26
26
 
27
27
  end
@@ -9,13 +9,14 @@ module FoodIngredientParser::Strict::Grammar
9
9
  end
10
10
 
11
11
  rule ingredient_simple_with_amount
12
- pre:( '{' ws* )? amount:amount ws+ ing:ingredient_simple <IngredientNode> /
12
+ pre:( '{' ws* )? amount:amount ws+ ing:ingredient_simple <IngredientNode> /
13
+ pre:( '{' ws* )? amount:amount_simple_percent ws* ing:ingredient_simple <IngredientNode> /
13
14
  ing:ingredient_simple ws* amount:amount post:( ws* '}' )? (ws? mark:mark)? <IngredientNode> /
14
15
  ing:ingredient_simple <IngredientNode>
15
16
  end
16
17
 
17
18
  rule ingredient_simple_e_number
18
- name:( [Ee] [0-9] [0-9] [0-9] [a-zA-Z]? ) ![a-zA-Z0-9] <IngredientNode>
19
+ name:e_number <IngredientNode>
19
20
  end
20
21
 
21
22
  end
@@ -5,8 +5,12 @@ module FoodIngredientParser::Strict::Grammar
5
5
  include Ingredient
6
6
 
7
7
  rule list_coloned
8
+ contains:( ( ws* list_coloned_ingredient ws* '.,')+ ws* list_coloned_ingredient ) <ListNode> /
9
+ contains:( ( ws* list_coloned_ingredient ws* '.,')+ ) <ListNode> /
8
10
  contains:( ( ws* list_coloned_ingredient ws* '.' )+ ws* list_coloned_ingredient ) <ListNode> /
9
11
  contains:( ( ws* list_coloned_ingredient ws* '.' )+ ) <ListNode> /
12
+ contains:( ( ws* list_coloned_ingredient ws* ';,')+ ws* list_coloned_ingredient ) <ListNode> /
13
+ contains:( ( ws* list_coloned_ingredient ws* ';,')+ ) <ListNode> /
10
14
  contains:( ( ws* list_coloned_ingredient ws* ';' )+ ws* list_coloned_ingredient ) <ListNode> /
11
15
  contains:( ( ws* list_coloned_ingredient ws* ';' )+ ) <ListNode> /
12
16
  contains:( ws* list_coloned_ingredient ) <ListNode>
@@ -40,7 +40,7 @@ module FoodIngredientParser::Strict::Grammar
40
40
  end
41
41
 
42
42
  rule root_mark_sentences_in_list
43
- ( ( ws* [,.;] / ws ) ws* root_mark_sentence_in_list )+
43
+ ( ( ws* [,.;] / ws )+ root_mark_sentence_in_list )+
44
44
  end
45
45
 
46
46
  rule root_mark_sentence_in_list
@@ -1,4 +1,4 @@
1
1
  module FoodIngredientParser
2
- VERSION = '1.1.2'
3
- VERSION_DATE = '2018-09-28'
2
+ VERSION = '1.1.7'
3
+ VERSION_DATE = '2020-12-28'
4
4
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: food_ingredient_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.2
4
+ version: 1.1.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - wvengen
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-09-28 00:00:00.000000000 Z
11
+ date: 2020-12-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: treetop
@@ -87,8 +87,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
87
87
  - !ruby/object:Gem::Version
88
88
  version: '0'
89
89
  requirements: []
90
- rubyforge_project:
91
- rubygems_version: 2.6.13
90
+ rubygems_version: 3.0.3
92
91
  signing_key:
93
92
  specification_version: 4
94
93
  summary: Parser for ingredient lists found on food products.