food_ingredient_parser 1.1.2 → 1.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 64ad7a10a1480b520602113bbcdfc10ba1daf8b5
4
- data.tar.gz: 4068a9edbe1dca908228f38d2795ad63a5cbcf76
2
+ SHA256:
3
+ metadata.gz: 59825ee90990b2c4f52c9e59fae2e34e5b4558bc63a57fc59946db7f71335351
4
+ data.tar.gz: b8201945554a11fddbac8eb6676c4cbff8c1f5d3523780ec54125e3172fd05ac
5
5
  SHA512:
6
- metadata.gz: 73ce876757b08e1d2cf0b5126e8d024b3728260134c3c4f3fe49fee14793da77ecc48d286165dc0c86e8363f2eddf6081355ac26d38f524371b367f4aa3cee23
7
- data.tar.gz: befa97dc0fd4605cd2019a2cf7a39aa15d5dfa15acf118f10e9b28e104bbc9bd5925f28286e780acb142eb73a935dba26657f1c6fe9cb766e7f4f88310d5ce55
6
+ metadata.gz: 3b97c863f9da5b26162883a3627809857fa9277e60e76f2c805312ceea34ad40317bcdf6ca2f1c56b2eafc4a5c637bbe0b5ac3c3c24ef8f874dd008bb5cc3bd7
7
+ data.tar.gz: bfcc88aea38c3db84670e84dd1166cbcdf8ce60edff297a2ed7324616ea53a4625ab137423f2f6701981a27a51b398ab652b207abe435328e4fa67974bad3f01
data/README.md CHANGED
@@ -185,9 +185,23 @@ So if you only use the stable interface (`parse`, `to_h` and `to_html`), you can
185
185
  to e.g. `~> 1.0`. If you depend on more, lock your version against e.g. `~> 1.0.0` and test when you
186
186
  upgrade to `1.1`.
187
187
 
188
+ ## Languages
189
+
190
+ While most of the parsing is language-independent, some parts need knowledge about certain words
191
+ (like abbreviations and amount specifiers). The gem was developed with ingredient lists in Dutch (nl),
192
+ plus a bit of English and German. Support for other languages is already good, but is lacking in certain
193
+ areas: improvements are welcome (starting with a corpus in [data/](data/)).
194
+
195
+ Many ingredient lists from the USA are structured a bit differently than those from Europe, so they
196
+ parse less well (though that's a matter of fine-tuning).
197
+
188
198
  ## Test data
189
199
 
190
- [`data/ingredient-samples-nl`](data/ingredient-samples-nl) contains about 150k
200
+ [`data/ingredient-samples-qm-nl`](data/ingredient-samples-qm-nl) contains about 150k
191
201
  real-world ingredient lists found on the Dutch market. Each line contains one ingredient
192
202
  list (newlines are encoded as `\n`, empty lines and those starting with `#` are ignored).
193
203
  The strict parser currently parses 80%, while the loose parser returns something for all of them.
204
+
205
+ ## License
206
+
207
+ This software is distributed under the [MIT license](LICENSE). Data may have a [different license](data/README.md).
@@ -41,8 +41,10 @@ def parse_single(s, parsed=nil, parser:, verbosity: 1, print: nil, escape: false
41
41
  if parsed
42
42
  puts(parsed.inspect) if verbosity > 1
43
43
  pp(parsed.to_h, color: color) if verbosity > 0
44
+ return true
44
45
  else
45
46
  puts "(no result: #{parser.parser.failure_reason})" if verbosity > 0
47
+ return false
46
48
  end
47
49
  end
48
50
 
@@ -63,6 +65,7 @@ def parse_file(path, parser:, verbosity: 1, print: nil, escape: false, color: fa
63
65
  pct_parsed = 100.0 * count_parsed / (count_parsed + count_noresult)
64
66
  pct_noresult = 100.0 * count_noresult / (count_parsed + count_noresult)
65
67
  puts "parsed #{colorize(color && "1;32", count_parsed)} (#{pct_parsed.round(1)}%), no result #{colorize(color && "1;31", count_noresult)} (#{pct_noresult.round(1)}%)"
68
+ return count_noresult
66
69
  end
67
70
 
68
71
  verbosity = 1
@@ -108,8 +111,10 @@ if strings.any? || files.any?
108
111
  STDERR.puts("Please specify one of the known parsers: #{PARSERS.keys.join(", ")}.")
109
112
  exit(1)
110
113
  end
111
- strings.each {|s| parse_single(s, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color) }
112
- files.each {|f| parse_file(f, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color) }
114
+ success = true
115
+ strings.each {|s| success &= parse_single(s, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color) }
116
+ files.each {|f| success &= parse_file(f, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color) == 0 }
117
+ success or exit(1)
113
118
  else
114
119
  STDERR.puts("Please specify one or more --file or --string arguments (see --help).")
115
120
  end
@@ -2,13 +2,14 @@ module FoodIngredientParser
2
2
  module Cleaner
3
3
 
4
4
  def self.clean(s)
5
- s.gsub!("\u00ad", "") # strip soft hyphen
6
- s.gsub!("\u0092", "'") # windows-1252 apostrophe - https://stackoverflow.com/a/15564279/2866660
7
- s.gsub!("aÄs", "aïs") # encoding issue for maïs
8
- s.gsub!("ï", "ï") # encoding issue
9
- s.gsub!("ë", "ë") # encoding issue
10
- s.gsub!(/\A\s*"(.*)"\s*\z/, '\1') # enclosing double quotation marks
11
- s.gsub!(/\A\s*'(.*)'\s*\z/, '\1') # enclosing single quotation marks
5
+ s.gsub!(/(_x005f_|_)x000d_/i, "\n") # fix sometimes encoding for newline
6
+ s.gsub!("\u00ad", "") # strip soft hyphen
7
+ s.gsub!("\u0092", "'") # windows-1252 apostrophe - https://stackoverflow.com/a/15564279/2866660
8
+ s.gsub!("", ",") # normalize unicode comma
9
+ s.gsub!("aÄs", "aïs") # encoding issue for maïs
10
+ s.gsub!("ï", "ï") # encoding issue
11
+ s.gsub!("ë", "ë") # encoding issue
12
+ s.gsub!(/\A\s*(["']+)(.*)\1\s*\z/, '\2') # enclosing quotation marks
12
13
  s
13
14
  end
14
15
 
@@ -4,16 +4,31 @@ module FoodIngredientParser::Loose
4
4
  class Scanner
5
5
 
6
6
  SEP_CHARS = "|;,.".freeze
7
- MARK_CHARS = "¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº⁽⁾†‡•°#^*".freeze
7
+ MARK_CHARS = "¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº⁽⁾†‡⁺•°▪◊#^˄*~".freeze
8
8
  PREFIX_RE = /\A\s*(ingredients|contains|ingred[iï][eë]nt(en)?(declaratie)?|bevat|dit zit er\s?in|samenstelling|zutaten)\b\s*[:;.]?\s*/i.freeze
9
- NOTE_RE = /\A\b(dit product kan\b|deze verpakking kan\b|kan sporen\b.*?\bbevatten\b|voor allergenen\b|allergenen\b|E\s*=|gemaakt in\b|geproduceerd in\b|bevat mogelijk\b|kijk voor meer\b|allergie-info|in de fabriek\b|in dit bedrijf\b|voor [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bis [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bgebruikt\b)/i.freeze
9
+ NOTE_RE = /\A\b(dit product kan\b|deze verpakking kan\b|kan sporen\b.*?\bbevatten\b|voor allergenen\b|allergenen\b|allergie[- ]informatie(\s*:|\b)|E\s*=|gemaakt in\b|geproduceerd in\b|bevat mogelijk\b|kijk voor meer\b|allergie-info|in de fabriek\b|in dit bedrijf\b|voor [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bis [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bgebruikt\b)/i.freeze
10
10
  # Keep in sync with +abbrev+ in the +Common+ grammar, plus relevant ones from the +Amount+ grammar.
11
- ABBREV_RE = Regexp.union(/\A(N°|°C|(ijzer|chroom|koper)\s*\(I+\)\s*[[:alnum:]]+)\b/i, *%w[
12
- a.o.p b.g.a b.o.b c.a c.i d.e d.m.v d.o.c d.o.p d.s e.a e.g e.u f.i.l f.o.s i.a
13
- i.d i.e i.g.m.e i.g.p i.m.v i.o i.v.m l.s.l n.a n.b n.o n.v.t o.a o.b.v p.d.o
14
- p.g.i q.s s.l s.s t.o.v u.h.t v.g v.s w.a w.o w.v vit denat
15
- min max ca
16
- ].map {|s| /\A#{Regexp.escape(s)}\b\.?/i}).freeze
11
+ ABBREV_RE = Regexp.union(
12
+ /\A(
13
+ N°\b |
14
+ °C\b |
15
+ (ijzer|chroom|koper)\s*\(I+\)\s*[[:alnum:]]+\b |
16
+ L\(\+\)[ -][[:alnum:]]+\b |
17
+ L\.\s+rhamnosus\b | L\.\s+acidophilus\b | L\.\s+casei\b | B\.\s+lactis | A\.\s+oryzae |
18
+ S\.\s+thermophilus\b | L\.\sbulgaricus\b |
19
+ T\.\s*aestivum\b(\s+vitt\.)? |
20
+ nucifera\s+L\. |
21
+ type\s+"\d+" |
22
+ E-e?\d{3}[a-z]?\s*\(i+\) |
23
+ www\.[-_\/:%.A-Za-z0-9]+
24
+ )/xi,
25
+ *%w[
26
+ a.o.p b.g.a b.o.b c.a c.i d.e d.m.v d.o.c d.o.p d.s e.a e.g e.u f.i.l f.o.s i.a
27
+ i.d i.e i.g.m.e i.g.p i.m.v i.o i.v.m l.s.l n.a n.b n.o n.v.t o.a o.b.v p.d.o
28
+ p.g.i q.s s.l s.s t.o.v u.h.t v.g v.s w.a w.o w.v vit denat alc vol conc subsp
29
+ min max ca
30
+ ].map {|s| /\A#{Regexp.escape(s)}\b\.?/i}
31
+ ).freeze
17
32
 
18
33
  def initialize(s, index: 0)
19
34
  @s = s # input string
@@ -5,8 +5,9 @@ module FoodIngredientParser::Loose::Transform
5
5
 
6
6
  rule amount_from_name
7
7
  # just amount, amount in front or at the end
8
- ws* amount:amount ws+ name:(.*) /
9
- ws* amount:amount ws* /
8
+ ws* amount:amount ws+ name:(.*) /
9
+ ws* amount:amount_simple_percent ws* name:(.*) /
10
+ ws* amount:amount ws* /
10
11
  ws* name:( !amount word ( ws+ !amount word )* )+ ws* amount:amount ws*
11
12
  end
12
13
  end
@@ -7,8 +7,9 @@ module FoodIngredientParser::Loose
7
7
  #
8
8
  # @note mark and amount is lost, this is not expected on e-numbers
9
9
 
10
- SPLIT_RE = /\s*-\s*/.freeze
11
- MATCH_RE = /\A\s*(e[0-9]{3}[a-z]?)(?:#{SPLIT_RE}(e[0-9]{3}[a-z]?))+\s*\z/i.freeze
10
+ SPLIT_RE = /\s*-\s*/.freeze
11
+ SINGLE_RE = /E-?\d{3}[a-z]?(?:\s*\(i+\))?/i.freeze
12
+ MATCH_RE = /\A\s*(#{SINGLE_RE})(?:#{SPLIT_RE}(#{SINGLE_RE}))+\s*\z/i.freeze
12
13
 
13
14
  def self.transform!(node)
14
15
  new(node).transform!
@@ -9,6 +9,10 @@ module FoodIngredientParser::Strict::Grammar
9
9
  amount:amount_simple <AmountNode>
10
10
  end
11
11
 
12
+ rule amount_simple_percent
13
+ amount:(amount_simple_number ws* percent) <AmountNode>
14
+ end
15
+
12
16
  rule amount_simple
13
17
  ( (
14
18
  'of which'i / 'at least'i / 'minimal'i / 'maximal'i / 'less than'i / 'more than'i /
@@ -17,21 +21,30 @@ module FoodIngredientParser::Strict::Grammar
17
21
  ) ws* )?
18
22
  amount_simple_quantity
19
23
  ( ws+ (
20
- 'minimaal'i / 'minimum'i / 'van het uitlekgewicht'i / 'van het geheel'i /
24
+ 'of a'i / 'of'i / 'or less of'i / 'or more of'i /
25
+ 'van een'i / 'minimaal'i / 'minimum'i / 'van het uitlekgewicht'i / 'van het geheel'i /
21
26
  'min.'i / 'min'i / 'max.'i / 'max'i
22
27
  ) )?
23
28
  end
24
29
 
25
30
  rule amount_simple_quantity
26
- amount_simple_number ( ws* amount_simple_unit? ws* dash ws* amount_simple_number )? ws* amount_simple_unit
31
+ amount_simple_number ( ws* amount_simple_unit? ws* dash ws* amount_simple_number )? ( ws* amount_simple_unit )?
27
32
  end
28
33
 
29
34
  rule amount_simple_number
30
- ( [±∓~∼∽≂≃≈≲≤<>≥≳] ws* )? number
35
+ ( amount_simple_comparator ws* )? number
36
+ end
37
+
38
+ rule amount_simple_comparator
39
+ '=' ws* [<>] /
40
+ [<>] ws* ( '=' / 'of gelijk aan'i !char / 'or equal to'i !char ) /
41
+ [±∓~∼∽≂≃≈≲≤<>≥≳] / '+/-' / '-/+'
31
42
  end
32
43
 
33
44
  rule amount_simple_unit
34
- ( percent / ( ( 'procent' / 'percent' / 'gram'i / 'ml'i / 'mg'i / 'gr'i / 'g'i ) !char ) )
45
+ ( percent / ( ( 'procent' / 'percent' / 'gram'i / 'ml'i / 'mg'i / 'gr'i / 'g'i / 'ppm'i ) !char ) )
46
+ ( ws 'vol'i ( !char / '.' ) )?
47
+ ( ws* '℮' )?
35
48
  end
36
49
  end
37
50
  end
@@ -10,17 +10,22 @@ module FoodIngredientParser::Strict::Grammar
10
10
  end
11
11
 
12
12
  rule char
13
- [[:alnum:]] /
13
+ !mark [[:alnum:]] /
14
14
  fraction /
15
- [-/\`'´’+=_{}&] /
16
- [®™] /
17
- [¿?] / # weird characters turning up in names (e.g. encoding issues)
15
+ [-/\`'"´‘’+=_{}&] /
16
+ [®©™♣] /
17
+ [¿?¯] / # weird characters turning up in names (e.g. encoding issues)
18
18
  [₁₂₃₄₅₆₇₈₉] # can occur with vitamins
19
19
  end
20
20
 
21
21
  rule mark
22
22
  # mark referencing a footnote
23
- [¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº] '⁾'? / '⁽' [¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº] '⁾' / [†‡•°#^] / '*'+ / '(' ws* ( [†‡•°#^] / '*'+ ) ws* ')'
23
+ [¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº] '⁾'? /
24
+ '⁽' [¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº] '⁾' /
25
+ [˄^] digit /
26
+ [†‡⁺•°▪◊#˄^~˛] /
27
+ '*'+ /
28
+ '(' ws* ( [†‡⁺•°▪◊#˄^~˛] / '*'+ ) ws* ')'
24
29
  end
25
30
 
26
31
  rule digit
@@ -28,7 +33,8 @@ module FoodIngredientParser::Strict::Grammar
28
33
  end
29
34
 
30
35
  rule fraction
31
- [½⅓⅔¼¾⅕⅖⅗⅘⅙⅚⅐⅛⅜⅝⅞⅑⅒]
36
+ [½⅓⅔¼¾⅕⅖⅗⅘⅙⅚⅐⅛⅜⅝⅞⅑⅒] /
37
+ digit+ '/' digit+
32
38
  end
33
39
 
34
40
  rule percent
@@ -48,7 +54,26 @@ module FoodIngredientParser::Strict::Grammar
48
54
  end
49
55
 
50
56
  rule and
51
- ( 'and' / 'en' / 'und' / '&' ) !char
57
+ ( 'and' / 'en' / 'und' ) !char / '&'
58
+ end
59
+
60
+ rule e_number
61
+ ( 'E'i '-'? [0-9] [0-9] [0-9] [[:alpha:]]? )
62
+ ![[:alnum:]] / ( ws* '(' 'i'i+ ')' ) # e.g. "E450 (iii)"
63
+ end
64
+
65
+ rule chem_systematic_name
66
+ ( chem_systematic_name_num dash ) ( chem_systematic_name_word dash chem_systematic_name_num dash ws? )* chem_systematic_name_word /
67
+ ( chem_systematic_name_word dash chem_systematic_name_num dash ws? )+ chem_systematic_name_word
68
+ end
69
+
70
+ rule chem_systematic_name_word
71
+ [A-Za-z]+ ( dash [A-Za-z]+ dash [A-Za-z]+ )*
72
+ end
73
+
74
+ rule chem_systematic_name_num
75
+ digit+ [RH] /
76
+ digit+ ( ',' digit+ )* '\''?
52
77
  end
53
78
 
54
79
  rule abbrev
@@ -105,8 +130,14 @@ module FoodIngredientParser::Strict::Grammar
105
130
  'w.o'i /
106
131
  'w.v'i /
107
132
  # not auto-generated additions
108
- 'vit'i /
109
- 'denat'i
133
+ 'nr.'i /
134
+ 'vit'i / # vitamin
135
+ 'denat'i / # denaturated
136
+ 'alc'i / # alcohol
137
+ 'vol'i / # volume
138
+ 'conc'i / # concentration
139
+ 'subsp'i / # subspecies
140
+ 'www.'i [-_\/:%.A-Za-z0-9]+
110
141
  )
111
142
  '.'? ![[:alpha:]]
112
143
  end
@@ -116,7 +147,15 @@ module FoodIngredientParser::Strict::Grammar
116
147
  (
117
148
  'N°'i /
118
149
  '°C'i /
119
- ( 'ijzer'i / 'chroom'i / 'koper'i ) ws* '(' 'I'i+ ')' ws* [[:alnum:]]+
150
+ ( 'ijzer'i / 'chroom'i / 'koper'i ) ws* '(' 'I'i+ ')' ws* [[:alnum:]]+ /
151
+ 'L(+)' ('-' / ws) [[:alnum:]]+ /
152
+ 'L.' ws+ 'rhamnosus'i / 'L.' ws+ 'acidophilus'i / 'L.' ws+ 'casei' / 'B.'i ws+ 'lactis'i / 'A.'i ws+ 'oryzae'i /
153
+ 'S.' ws+ 'thermophilus'i / 'L.' ws+ 'bulgaricus'i /
154
+ 'T.' ws* 'aestivum'i (ws+ 'vitt.'i)? /
155
+ 'nucifera' ws+ 'L.'i /
156
+ 'type'i ws+ '"' [0-9]+ '"' /
157
+ e_number /
158
+ chem_systematic_name
120
159
  ) ![[:alpha:]]
121
160
  end
122
161
  end
@@ -21,7 +21,7 @@ module FoodIngredientParser::Strict::Grammar
21
21
 
22
22
  rule ingredient_nested_contains
23
23
  'contains'i /
24
- 'bevat'i
24
+ 'bevat'i / 'bevat o.a.'i / 'o.a.'i / 'met'i
25
25
  end
26
26
 
27
27
  end
@@ -9,13 +9,14 @@ module FoodIngredientParser::Strict::Grammar
9
9
  end
10
10
 
11
11
  rule ingredient_simple_with_amount
12
- pre:( '{' ws* )? amount:amount ws+ ing:ingredient_simple <IngredientNode> /
12
+ pre:( '{' ws* )? amount:amount ws+ ing:ingredient_simple <IngredientNode> /
13
+ pre:( '{' ws* )? amount:amount_simple_percent ws* ing:ingredient_simple <IngredientNode> /
13
14
  ing:ingredient_simple ws* amount:amount post:( ws* '}' )? (ws? mark:mark)? <IngredientNode> /
14
15
  ing:ingredient_simple <IngredientNode>
15
16
  end
16
17
 
17
18
  rule ingredient_simple_e_number
18
- name:( [Ee] [0-9] [0-9] [0-9] [a-zA-Z]? ) ![a-zA-Z0-9] <IngredientNode>
19
+ name:e_number <IngredientNode>
19
20
  end
20
21
 
21
22
  end
@@ -5,8 +5,12 @@ module FoodIngredientParser::Strict::Grammar
5
5
  include Ingredient
6
6
 
7
7
  rule list_coloned
8
+ contains:( ( ws* list_coloned_ingredient ws* '.,')+ ws* list_coloned_ingredient ) <ListNode> /
9
+ contains:( ( ws* list_coloned_ingredient ws* '.,')+ ) <ListNode> /
8
10
  contains:( ( ws* list_coloned_ingredient ws* '.' )+ ws* list_coloned_ingredient ) <ListNode> /
9
11
  contains:( ( ws* list_coloned_ingredient ws* '.' )+ ) <ListNode> /
12
+ contains:( ( ws* list_coloned_ingredient ws* ';,')+ ws* list_coloned_ingredient ) <ListNode> /
13
+ contains:( ( ws* list_coloned_ingredient ws* ';,')+ ) <ListNode> /
10
14
  contains:( ( ws* list_coloned_ingredient ws* ';' )+ ws* list_coloned_ingredient ) <ListNode> /
11
15
  contains:( ( ws* list_coloned_ingredient ws* ';' )+ ) <ListNode> /
12
16
  contains:( ws* list_coloned_ingredient ) <ListNode>
@@ -40,7 +40,7 @@ module FoodIngredientParser::Strict::Grammar
40
40
  end
41
41
 
42
42
  rule root_mark_sentences_in_list
43
- ( ( ws* [,.;] / ws ) ws* root_mark_sentence_in_list )+
43
+ ( ( ws* [,.;] / ws )+ root_mark_sentence_in_list )+
44
44
  end
45
45
 
46
46
  rule root_mark_sentence_in_list
@@ -1,4 +1,4 @@
1
1
  module FoodIngredientParser
2
- VERSION = '1.1.2'
3
- VERSION_DATE = '2018-09-28'
2
+ VERSION = '1.1.7'
3
+ VERSION_DATE = '2020-12-28'
4
4
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: food_ingredient_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.2
4
+ version: 1.1.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - wvengen
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-09-28 00:00:00.000000000 Z
11
+ date: 2020-12-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: treetop
@@ -87,8 +87,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
87
87
  - !ruby/object:Gem::Version
88
88
  version: '0'
89
89
  requirements: []
90
- rubyforge_project:
91
- rubygems_version: 2.6.13
90
+ rubygems_version: 3.0.3
92
91
  signing_key:
93
92
  specification_version: 4
94
93
  summary: Parser for ingredient lists found on food products.