food_ingredient_parser 1.1.5 → 1.1.10

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 8c80d036dbee183ed2fd1a8cc4e513e54318d142
4
- data.tar.gz: 6cfcd29eacb9e99a9be9a66a90446f47f714ae4b
2
+ SHA256:
3
+ metadata.gz: a56d22b7e67a3a913b051bcbda8da885ddd467dc53f5a0df0faa5b40759a1f35
4
+ data.tar.gz: 427dd79c9f9203dc7901ead6264e08c05183d02aec266ac1d3bff930a5ba1dcd
5
5
  SHA512:
6
- metadata.gz: 75b1f91e5db6bcfcc24ad8eabe16b541663e5e344604c31b884ab1a5633c95dc045cdee2d384f5434f5143b778659cecd69917b9adb16e25341978380e486bcc
7
- data.tar.gz: 7254cca971a558bda2ae6e996cc4d121fc5138f09d67c847bb44eec87421aeea39bf18a771137eb3b0b2bb734bed058a1d2347f815b9edc1b23e0b069d83a381
6
+ metadata.gz: 0b07032ade3a55ce208bcb0c069223b41aee21f185a2b6a9bb91332881dfef8e1d829ae966097e48ffdba9984517be43b10bd027099f9bdce04e3a4c6fc41ca8
7
+ data.tar.gz: ebdf452a09d54b151ce8cfa9bb65b4477dd1afc81bfc5cd1d94055f726d387f522dd04a3e11b73d4b26222a18bc1068912a81c8e2e3cd8439b0cee1c1ec290d7
data/README.md CHANGED
@@ -69,6 +69,7 @@ Usage: bin/food_ingredient_parser [options] --file|-f <filename>
69
69
  -r, --parser PARSER Use specific parser (strict, loose).
70
70
  -e, --[no-]escape Escape newlines
71
71
  -c, --[no-]color Use color
72
+ --[no-]html Print as HTML with parsing markup
72
73
  -v, --[no-]verbose Show more data (parsed tree).
73
74
  --version Show program version.
74
75
  -h, --help Show this help
@@ -103,6 +104,9 @@ RootNode+Root3 offset=0, "tomato" (contains,notes):
103
104
  SyntaxNode offset=6, ""
104
105
  {:contains=>[{:name=>"tomato"}]}
105
106
 
107
+ $ bin/food_ingredient_parser --html -s "tomato"
108
+ <div class="root"><span class='depth0'><span class='name'>tomato</span></span></div>
109
+
106
110
  $ food_ingredient_parser -v -r loose -s "tomato"
107
111
  "tomato"
108
112
  Node interval=0..5
@@ -2,6 +2,7 @@
2
2
  #
3
3
  # Parser for food ingredient lists.
4
4
  #
5
+ require 'cgi'
5
6
  require 'optparse'
6
7
 
7
8
  $:.push(File.expand_path(File.dirname(__FILE__) + "/../lib"))
@@ -31,22 +32,31 @@ def colorize(color, s)
31
32
  end
32
33
  end
33
34
 
34
- def parse_single(s, parsed=nil, parser:, verbosity: 1, print: nil, escape: false, color: false)
35
+ def parse_single(s, parsed=nil, parser:, verbosity: 1, print: nil, escape: false, color: false, html: false)
35
36
  parsed ||= parser.parse(s)
36
37
 
37
38
  return unless print.nil? || (parsed && print == :parsed) || (!parsed && print == :noresult)
38
39
 
39
- puts colorize(color && "0;32", escape ? s.gsub("\n", "\\n") : s) if verbosity > 0
40
+ puts colorize(color && "0;32", escape ? s.gsub("\n", "\\n") : s) if !html && verbosity > 0
40
41
 
41
- if parsed
42
+ if !html && parsed
42
43
  puts(parsed.inspect) if verbosity > 1
43
44
  pp(parsed.to_h, color: color) if verbosity > 0
44
- else
45
+
46
+ elsif !html && !parsed
45
47
  puts "(no result: #{parser.parser.failure_reason})" if verbosity > 0
48
+
49
+ elsif html && parsed
50
+ puts('<div class="root">' + parsed.to_html + '</div>') if verbosity > 0
51
+
52
+ else
53
+ puts('<div class="root">' + CGI.escapeHTML(parsed) + '</div>') if verbosity > 0
46
54
  end
55
+
56
+ return !!parsed
47
57
  end
48
58
 
49
- def parse_file(path, parser:, verbosity: 1, print: nil, escape: false, color: false)
59
+ def parse_file(path, parser:, verbosity: 1, print: nil, escape: false, color: false, html: false)
50
60
  count_parsed = count_noresult = 0
51
61
  File.foreach(path) do |line|
52
62
  next if line =~ /^#/ # comment
@@ -57,12 +67,13 @@ def parse_file(path, parser:, verbosity: 1, print: nil, escape: false, color: fa
57
67
  count_parsed += 1 if parsed
58
68
  count_noresult += 1 unless parsed
59
69
 
60
- parse_single(line, parsed, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color)
70
+ parse_single(line, parsed, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color, html: html)
61
71
  end
62
72
 
63
73
  pct_parsed = 100.0 * count_parsed / (count_parsed + count_noresult)
64
74
  pct_noresult = 100.0 * count_noresult / (count_parsed + count_noresult)
65
75
  puts "parsed #{colorize(color && "1;32", count_parsed)} (#{pct_parsed.round(1)}%), no result #{colorize(color && "1;31", count_noresult)} (#{pct_noresult.round(1)}%)"
76
+ return count_noresult
66
77
  end
67
78
 
68
79
  verbosity = 1
@@ -72,6 +83,7 @@ print = nil
72
83
  parser_name = :strict
73
84
  escape = false
74
85
  color = true
86
+ html = false
75
87
  PARSERS = {
76
88
  strict: FoodIngredientParser::Strict::Parser,
77
89
  loose: FoodIngredientParser::Loose::Parser
@@ -92,6 +104,7 @@ OptionParser.new do |opts|
92
104
  opts.on("-r", "--parser PARSER", "Use specific parser (#{PARSERS.keys.join(", ")}).") {|p| parser_name = p&.downcase&.to_sym }
93
105
  opts.on("-e", "--[no-]escape", "Escape newlines") {|e| escape = !!e }
94
106
  opts.on("-c", "--[no-]color", "Use color") {|e| color = !!e }
107
+ opts.on( "--[no-]html", "Print as HTML with parsing markup") {|e| html = !!e }
95
108
  opts.on("-v", "--[no-]verbose", "Show more data (parsed tree).") {|v| verbosity = v ? 2 : 1 }
96
109
  opts.on( "--version", "Show program version.") do
97
110
  puts("food_ingredient_parser v#{FoodIngredientParser::VERSION}")
@@ -108,8 +121,10 @@ if strings.any? || files.any?
108
121
  STDERR.puts("Please specify one of the known parsers: #{PARSERS.keys.join(", ")}.")
109
122
  exit(1)
110
123
  end
111
- strings.each {|s| parse_single(s, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color) }
112
- files.each {|f| parse_file(f, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color) }
124
+ success = true
125
+ strings.each {|s| success &= parse_single(s, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color, html: html) }
126
+ files.each {|f| success &= parse_file(f, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color, html: html) == 0 }
127
+ success or exit(1)
113
128
  else
114
129
  STDERR.puts("Please specify one or more --file or --string arguments (see --help).")
115
130
  end
@@ -2,13 +2,14 @@ module FoodIngredientParser
2
2
  module Cleaner
3
3
 
4
4
  def self.clean(s)
5
- s.gsub!("\u00ad", "") # strip soft hyphen
6
- s.gsub!("\u0092", "'") # windows-1252 apostrophe - https://stackoverflow.com/a/15564279/2866660
7
- s.gsub!("aÄs", "aïs") # encoding issue for maïs
8
- s.gsub!("ï", "ï") # encoding issue
9
- s.gsub!("ë", "ë") # encoding issue
10
- s.gsub!(/\A\s*"(.*)"\s*\z/, '\1') # enclosing double quotation marks
11
- s.gsub!(/\A\s*'(.*)'\s*\z/, '\1') # enclosing single quotation marks
5
+ s.gsub!(/(_x005f_|_)x000d_/i, "\n") # fix newlines that are sometimes encoded as _x000d_ (or _x005f_x000d_)
6
+ s.gsub!("\u00ad", "") # strip soft hyphen
7
+ s.gsub!("\u0092", "'") # windows-1252 apostrophe - https://stackoverflow.com/a/15564279/2866660
8
+ s.gsub!("", ",") # normalize unicode comma
9
+ s.gsub!("aÄs", "aïs") # encoding issue for maïs
10
+ s.gsub!("ï", "ï") # encoding issue
11
+ s.gsub!("ë", "ë") # encoding issue
12
+ s.gsub!(/\A\s*(["']+)(.*)\1\s*\z/, '\2') # enclosing quotation marks
12
13
  s
13
14
  end
14
15
 
@@ -4,7 +4,7 @@ module FoodIngredientParser::Loose
4
4
  class Scanner
5
5
 
6
6
  SEP_CHARS = "|;,.".freeze
7
- MARK_CHARS = "¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº⁽⁾†‡•°▪◊#^*".freeze
7
+ MARK_CHARS = "¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº⁽⁾†‡⁺•°▪◊#^˄*~".freeze
8
8
  PREFIX_RE = /\A\s*(ingredients|contains|ingred[iï][eë]nt(en)?(declaratie)?|bevat|dit zit er\s?in|samenstelling|zutaten)\b\s*[:;.]?\s*/i.freeze
9
9
  NOTE_RE = /\A\b(dit product kan\b|deze verpakking kan\b|kan sporen\b.*?\bbevatten\b|voor allergenen\b|allergenen\b|allergie[- ]informatie(\s*:|\b)|E\s*=|gemaakt in\b|geproduceerd in\b|bevat mogelijk\b|kijk voor meer\b|allergie-info|in de fabriek\b|in dit bedrijf\b|voor [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bis [0-9,.]+ (g\.?|gr\.?|ram|ml).*\bgebruikt\b)/i.freeze
10
10
  # Keep in sync with +abbrev+ in the +Common+ grammar, plus relevant ones from the +Amount+ grammar.
@@ -17,8 +17,9 @@ module FoodIngredientParser::Loose
17
17
  L\.\s+rhamnosus\b | L\.\s+acidophilus\b | L\.\s+casei\b | B\.\s+lactis | A\.\s+oryzae |
18
18
  S\.\s+thermophilus\b | L\.\sbulgaricus\b |
19
19
  T\.\s*aestivum\b(\s+vitt\.)? |
20
+ nucifera\s+L\. |
20
21
  type\s+"\d+" |
21
- E-e?\d{3}[a-z]?\s*\(i+\) |
22
+ E(-|\s+)?\d{3}[a-z]?\s*(\([iv]+\)|\[[iv]+\]) |
22
23
  www\.[-_\/:%.A-Za-z0-9]+
23
24
  )/xi,
24
25
  *%w[
@@ -7,8 +7,8 @@ module FoodIngredientParser::Loose
7
7
  #
8
8
  # @note mark and amount are lost; they are not expected on e-numbers anyway
9
9
 
10
- SPLIT_RE = /\s*-\s*/.freeze
11
- SINGLE_RE = /E-?\d{3}[a-z]?(?:\s*\(i+\))?/i.freeze
10
+ SPLIT_RE = /\s*(-|\ben\b|\band\b|\bund\b|\bet\b)\s*/.freeze
11
+ SINGLE_RE = /E(-|\s+)?\d{3}[a-z]?\s*(\([iv]+\)|\[[iv]+\])?/i.freeze
12
12
  MATCH_RE = /\A\s*(#{SINGLE_RE})(?:#{SPLIT_RE}(#{SINGLE_RE}))+\s*\z/i.freeze
13
13
 
14
14
  def self.transform!(node)
@@ -21,22 +21,28 @@ module FoodIngredientParser::Strict::Grammar
21
21
  ) ws* )?
22
22
  amount_simple_quantity
23
23
  ( ws+ (
24
- 'of'i / 'or less of'i / 'or more of'i /
25
- 'minimaal'i / 'minimum'i / 'van het uitlekgewicht'i / 'van het geheel'i /
24
+ 'of a'i / 'of'i / 'or less of'i / 'or more of'i /
25
+ 'van een'i / 'minimaal'i / 'minimum'i / 'van het uitlekgewicht'i / 'van het geheel'i /
26
26
  'min.'i / 'min'i / 'max.'i / 'max'i
27
27
  ) )?
28
28
  end
29
29
 
30
30
  rule amount_simple_quantity
31
- amount_simple_number ( ws* amount_simple_unit? ws* dash ws* amount_simple_number )? ws* amount_simple_unit
31
+ amount_simple_number ( ws* amount_simple_unit? ws* dash ws* amount_simple_number )? ( ws* amount_simple_unit )?
32
32
  end
33
33
 
34
34
  rule amount_simple_number
35
- ( [±∓~∼∽≂≃≈≲≤<>≥≳] ws* )? number
35
+ ( amount_simple_comparator ws* )? number
36
+ end
37
+
38
+ rule amount_simple_comparator
39
+ '=' ws* [<>] /
40
+ [<>] ws* ( '=' / 'of gelijk aan'i !char / 'or equal to'i !char ) /
41
+ [±∓~∼∽≂≃≈≲≤<>≥≳] / '+/-' / '-/+'
36
42
  end
37
43
 
38
44
  rule amount_simple_unit
39
- ( percent / ( ( 'procent' / 'percent' / 'gram'i / 'ml'i / 'mg'i / 'gr'i / 'g'i ) !char ) )
45
+ ( percent / ( ( 'procent' / 'percent' / 'gram'i / 'ml'i / 'mg'i / 'gr'i / 'g'i / 'ppm'i ) !char ) )
40
46
  ( ws 'vol'i ( !char / '.' ) )?
41
47
  ( ws* '℮' )?
42
48
  end
@@ -12,15 +12,20 @@ module FoodIngredientParser::Strict::Grammar
12
12
  rule char
13
13
  !mark [[:alnum:]] /
14
14
  fraction /
15
- [-/\`'´‘’+=_{}&] /
16
- [®™] /
17
- [¿?] / # weird characters turning up in names (e.g. encoding issues)
15
+ [-/\`'"´‘’+=_{}&] /
16
+ [®©™♣] /
17
+ [¿?¯] / # weird characters turning up in names (e.g. encoding issues)
18
18
  [₁₂₃₄₅₆₇₈₉] # can occur with vitamins
19
19
  end
20
20
 
21
21
  rule mark
22
22
  # mark referencing a footnote
23
- [¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº] '⁾'? / '⁽' [¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº] '⁾' / [†‡•°▪◊#^] / '*'+ / '(' ws* ( [†‡•°▪◊#^] / '*'+ ) ws* ')'
23
+ [¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº] '⁾'? /
24
+ '⁽' [¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº] '⁾' /
25
+ [˄^] digit /
26
+ [†‡⁺•°▪◊#˄^~˛] /
27
+ '*'+ /
28
+ '(' ws* ( [†‡⁺•°▪◊#˄^~˛] / '*'+ ) ws* ')'
24
29
  end
25
30
 
26
31
  rule digit
@@ -28,7 +33,8 @@ module FoodIngredientParser::Strict::Grammar
28
33
  end
29
34
 
30
35
  rule fraction
31
- [½⅓⅔¼¾⅕⅖⅗⅘⅙⅚⅐⅛⅜⅝⅞⅑⅒]
36
+ [½⅓⅔¼¾⅕⅖⅗⅘⅙⅚⅐⅛⅜⅝⅞⅑⅒] /
37
+ digit+ '/' digit+
32
38
  end
33
39
 
34
40
  rule percent
@@ -52,8 +58,22 @@ module FoodIngredientParser::Strict::Grammar
52
58
  end
53
59
 
54
60
  rule e_number
55
- ( 'E'i '-'? [0-9] [0-9] [0-9] [[:alpha:]]? )
56
- ![[:alnum:]] / ( ws* '(' 'i'i+ ')' ) # e.g. "E450 (iii)"
61
+ ( 'E'i ( '-' / ws+ )? [0-9] [0-9] [0-9] [[:alpha:]]? )
62
+ ( ( ws* '(' [iIvV]+ ')' ) / ( ws* '[' [iIvV]+ ']' ) / ![[:alnum:]] ) # e.g. "E450 (iii)"
63
+ end
64
+
65
+ rule chem_systematic_name
66
+ ( chem_systematic_name_num dash ) ( chem_systematic_name_word dash chem_systematic_name_num dash ws? )* chem_systematic_name_word /
67
+ ( chem_systematic_name_word dash chem_systematic_name_num dash ws? )+ chem_systematic_name_word
68
+ end
69
+
70
+ rule chem_systematic_name_word
71
+ [A-Za-z]+ ( dash [A-Za-z]+ dash [A-Za-z]+ )*
72
+ end
73
+
74
+ rule chem_systematic_name_num
75
+ digit+ [RH] /
76
+ digit+ ( ',' digit+ )* '\''?
57
77
  end
58
78
 
59
79
  rule abbrev
@@ -110,6 +130,7 @@ module FoodIngredientParser::Strict::Grammar
110
130
  'w.o'i /
111
131
  'w.v'i /
112
132
  # not auto-generated additions
133
+ 'nr.'i /
113
134
  'vit'i / # vitamin
114
135
  'denat'i / # denaturated
115
136
  'alc'i / # alcohol
@@ -131,8 +152,10 @@ module FoodIngredientParser::Strict::Grammar
131
152
  'L.' ws+ 'rhamnosus'i / 'L.' ws+ 'acidophilus'i / 'L.' ws+ 'casei' / 'B.'i ws+ 'lactis'i / 'A.'i ws+ 'oryzae'i /
132
153
  'S.' ws+ 'thermophilus'i / 'L.' ws+ 'bulgaricus'i /
133
154
  'T.' ws* 'aestivum'i (ws+ 'vitt.'i)? /
155
+ 'nucifera' ws+ 'L.'i /
134
156
  'type'i ws+ '"' [0-9]+ '"' /
135
- e_number
157
+ e_number /
158
+ chem_systematic_name
136
159
  ) ![[:alpha:]]
137
160
  end
138
161
  end
@@ -5,15 +5,15 @@ module FoodIngredientParser::Strict::Grammar
5
5
  include Ingredient
6
6
 
7
7
  rule list_coloned
8
- contains:( ( ws* list_coloned_ingredient ws* '.,')+ ws* list_coloned_ingredient ) <ListNode> /
9
- contains:( ( ws* list_coloned_ingredient ws* '.,')+ ) <ListNode> /
10
- contains:( ( ws* list_coloned_ingredient ws* '.' )+ ws* list_coloned_ingredient ) <ListNode> /
11
- contains:( ( ws* list_coloned_ingredient ws* '.' )+ ) <ListNode> /
12
- contains:( ( ws* list_coloned_ingredient ws* ';,')+ ws* list_coloned_ingredient ) <ListNode> /
13
- contains:( ( ws* list_coloned_ingredient ws* ';,')+ ) <ListNode> /
14
- contains:( ( ws* list_coloned_ingredient ws* ';' )+ ws* list_coloned_ingredient ) <ListNode> /
15
- contains:( ( ws* list_coloned_ingredient ws* ';' )+ ) <ListNode> /
16
- contains:( ws* list_coloned_ingredient ) <ListNode>
8
+ contains:( ( ws* list_coloned_ingredient ws* '.' ws* ',' )+ ws* list_coloned_ingredient ) <ListNode> /
9
+ contains:( ( ws* list_coloned_ingredient ws* '.' ws* ',' )+ ) <ListNode> /
10
+ contains:( ( ws* list_coloned_ingredient ws* '.' )+ ws* list_coloned_ingredient ) <ListNode> /
11
+ contains:( ( ws* list_coloned_ingredient ws* '.' )+ ) <ListNode> /
12
+ contains:( ( ws* list_coloned_ingredient ws* ';' ws* ',' )+ ws* list_coloned_ingredient ) <ListNode> /
13
+ contains:( ( ws* list_coloned_ingredient ws* ';' ws* ',' )+ ) <ListNode> /
14
+ contains:( ( ws* list_coloned_ingredient ws* ';' )+ ws* list_coloned_ingredient ) <ListNode> /
15
+ contains:( ( ws* list_coloned_ingredient ws* ';' )+ ) <ListNode> /
16
+ contains:( ws* list_coloned_ingredient ) <ListNode>
17
17
  end
18
18
 
19
19
  rule list_coloned_inner_list
@@ -22,7 +22,7 @@ module FoodIngredientParser::Strict::Grammar
22
22
  end
23
23
 
24
24
  rule list_coloned_ingredient
25
- ing:ingredient_simple_with_amount ws* ':' ws* amount:amount post:( ws* '}' )? <IngredientNode> /
25
+ ing:ingredient_simple_with_amount ws* ':' ws* amount:amount post:( ws* '}' )? !( ws* word ) <IngredientNode> /
26
26
  ing:ingredient_simple_with_amount ws* ':' post:( ws* '}' )? ws* contains:list_coloned_inner_list <NestedIngredientNode>
27
27
  end
28
28
 
@@ -40,7 +40,7 @@ module FoodIngredientParser::Strict::Grammar
40
40
  end
41
41
 
42
42
  rule root_mark_sentences_in_list
43
- ( ( ws* [,.;] / ws ) ws* root_mark_sentence_in_list )+
43
+ ( ( ws* [,.;] / ws )+ root_mark_sentence_in_list )+
44
44
  end
45
45
 
46
46
  rule root_mark_sentence_in_list
@@ -1,4 +1,4 @@
1
1
  module FoodIngredientParser
2
- VERSION = '1.1.5'
3
- VERSION_DATE = '2019-11-14'
2
+ VERSION = '1.1.10'
3
+ VERSION_DATE = '2021-03-23'
4
4
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: food_ingredient_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.5
4
+ version: 1.1.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - wvengen
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-11-14 00:00:00.000000000 Z
11
+ date: 2021-03-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: treetop
@@ -87,8 +87,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
87
87
  - !ruby/object:Gem::Version
88
88
  version: '0'
89
89
  requirements: []
90
- rubyforge_project:
91
- rubygems_version: 2.6.13
90
+ rubygems_version: 3.0.3
92
91
  signing_key:
93
92
  specification_version: 4
94
93
  summary: Parser for ingredient lists found on food products.