food_ingredient_parser 1.0.0.pre.5 → 1.0.0.pre.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +44 -8
  3. data/bin/food_ingredient_parser +13 -5
  4. data/lib/food_ingredient_parser/cleaner.rb +16 -0
  5. data/lib/food_ingredient_parser/loose/node.rb +60 -0
  6. data/lib/food_ingredient_parser/loose/parser.rb +24 -0
  7. data/lib/food_ingredient_parser/loose/scanner.rb +191 -0
  8. data/lib/food_ingredient_parser/loose/transform/amount.rb +70 -0
  9. data/lib/food_ingredient_parser/loose/transform/amount_from_name.treetop +13 -0
  10. data/lib/food_ingredient_parser/{grammar → strict/grammar}/amount.treetop +6 -5
  11. data/lib/food_ingredient_parser/{grammar → strict/grammar}/common.treetop +1 -1
  12. data/lib/food_ingredient_parser/{grammar → strict/grammar}/ingredient.treetop +1 -1
  13. data/lib/food_ingredient_parser/{grammar → strict/grammar}/ingredient_coloned.treetop +1 -1
  14. data/lib/food_ingredient_parser/{grammar → strict/grammar}/ingredient_nested.treetop +1 -1
  15. data/lib/food_ingredient_parser/{grammar → strict/grammar}/ingredient_simple.treetop +1 -1
  16. data/lib/food_ingredient_parser/{grammar → strict/grammar}/list.treetop +1 -1
  17. data/lib/food_ingredient_parser/{grammar → strict/grammar}/list_coloned.treetop +1 -1
  18. data/lib/food_ingredient_parser/{grammar → strict/grammar}/list_newlined.treetop +1 -1
  19. data/lib/food_ingredient_parser/{grammar → strict/grammar}/root.treetop +1 -1
  20. data/lib/food_ingredient_parser/strict/nodes.rb +74 -0
  21. data/lib/food_ingredient_parser/{parser.rb → strict/parser.rb} +3 -15
  22. data/lib/food_ingredient_parser/strict/to_html.rb +54 -0
  23. data/lib/food_ingredient_parser/version.rb +2 -2
  24. data/lib/food_ingredient_parser.rb +2 -1
  25. metadata +22 -16
  26. data/lib/food_ingredient_parser/nodes.rb +0 -72
  27. data/lib/food_ingredient_parser/to_html.rb +0 -52
  28. /data/lib/food_ingredient_parser/{grammar.rb → strict/grammar.rb} +0 -0
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8725e4ed3763020de6b46cad6709ce05aca0b77f
4
- data.tar.gz: fedb82af99346e8db38a3beaca5914ab44be197a
3
+ metadata.gz: 54bdb9187f9a2dfbec67737ddc2a3ad90f4ca058
4
+ data.tar.gz: fcfc99674e0f58801ca3a375acebe91ba3f80c84
5
5
  SHA512:
6
- metadata.gz: 114feb403f87140f2eccc21860c9a75aff4cff3583f3046f9815299f90dc6374c73b3b9ee60d682c3a9f91b2803ab3fc6c934f3255716986a494fbb51c9e5564
7
- data.tar.gz: 8f94825d627ab8068fb3dfd6c20f61a20fc8612f06ec6a760d33e46def103ca11497e364d9e3a7c4720647794e644a7253ba6e226d984801a55e089a5961b753
6
+ metadata.gz: 773526e862a74f04614486f3542de0c89f8663e10f959978a7d3a2e1ba8703e8a9ae93bde6b043a62326be68956733a30e6c01d85b6578dba0abc9064590fb18
7
+ data.tar.gz: e3296ae3222745f20727eed70bd1a9dac84a1714f1bdc64c2cd524bfbd6589a470afea33eab1e4ce717dbec9f4b3b3d8d9350025d13a4046e78cea3d0aea9b6d
data/README.md CHANGED
@@ -22,11 +22,11 @@ require 'food_ingredient_parser'
22
22
  s = "Water* 60%, suiker 30%, voedingszuren: citroenzuur, appelzuur, zuurteregelaar: E576/E577, " \
23
23
  + "natuurlijke citroen-limoen aroma's 0,2%, zoetstof: steviolglycosiden, * = Biologisch. " \
24
24
  + "E = door de E.U. goedgekeurde toevoeging."
25
- parser = FoodIngredientParser::Parser.new
25
+ parser = FoodIngredientParser::Strict::Parser.new
26
26
  puts parser.parse(s).to_h.inspect
27
27
  ```
28
28
  Results in
29
- ```
29
+ ```ruby
30
30
  {
31
31
  :contains=>[
32
32
  {:name=>"Water", :amount=>"60%", :mark=>"*"},
@@ -58,14 +58,15 @@ running this from the source tree, use `bin/food_ingredient_parser` instead.
58
58
 
59
59
  ```
60
60
  $ food_ingredient_parser -h
61
- Usage: food_ingredient_parser [options] --file|-f <filename>
62
- food_ingredient_parser [options] --string|-s <ingredients>
61
+ Usage: bin/food_ingredient_parser [options] --file|-f <filename>
62
+ bin/food_ingredient_parser [options] --string|-s <ingredients>
63
63
 
64
64
  -f, --file FILE Parse all lines of the file as ingredient lists.
65
65
  -s, --string INGREDIENTS Parse specified ingredient list.
66
66
  -q, --[no-]quiet Only show summary.
67
67
  -p, --parsed Only show lines that were successfully parsed.
68
- -e, --escape Escape newlines
68
+ -r, --parser PARSER Use specific parser (strict, loose).
69
+ -e, --[no-]escape Escape newlines
69
70
  -c, --[no-]color Use color
70
71
  -n, --noresult Only show lines that had no result.
71
72
  -v, --[no-]verbose Show more data (parsed tree).
@@ -102,6 +103,12 @@ RootNode+Root3 offset=0, "tomato" (contains,notes):
102
103
  SyntaxNode offset=6, ""
103
104
  {:contains=>[{:name=>"tomato"}]}
104
105
 
106
+ $ food_ingredient_parser -v -r loose -s "tomato"
107
+ "tomato"
108
+ Node interval=0..5
109
+ Node interval=0..5, name="tomato"
110
+ {:contains=>[{:name=>"tomato"}]}
111
+
105
112
  $ food_ingredient_parser -q -f data/test-cases
106
113
  parsed 35 (100.0%), no result 0 (0.0%)
107
114
  ```
@@ -114,12 +121,12 @@ When ingredient lists are entered manually, it can be very useful to show how th
114
121
  recognized. This can help understanding why a certain ingredients list cannot be parsed.
115
122
 
116
123
  For this you can use the `to_html` method on the parsed output, which returns the original
117
- text, augmented with CSS classes for different parts.
124
+ text, augmented with CSS classes for different parts. (Available for strict parser only.)
118
125
 
119
126
  ```ruby
120
127
  require 'food_ingredient_parser'
121
128
 
122
- parsed = FoodIngredientParser::Parser.new.parse("Saus (10% tomaat*, zout). * = bio")
129
+ parsed = FoodIngredientParser::Strict::Parser.new.parse("Saus (10% tomaat*, zout). * = bio")
123
130
  puts parsed.to_html
124
131
  ```
125
132
 
@@ -138,9 +145,38 @@ For an example of an interactive editor, see [examples/editor.rb](examples/edito
138
145
 
139
146
  ![editor example screenshot](examples/editor-screenshot.png)
140
147
 
148
+ ## Loose parser
149
+
150
+ The strict parser only parses ingredient lists that conform to one of the many different
151
+ formats expected. If you'd like to return a result always, even if that is not necessarily
152
+ completely correct, you can use the _loose_ parser. This does not use Treetop, but looks
153
+ at the input character for character and tries to make the best of it. Nevertheless, if you
154
+ just want to have _some_ result, this can still be very useful.
155
+
156
+ ```ruby
157
+ require 'food_ingredient_parser'
158
+
159
+ parsed = FoodIngredientParser::Loose::Parser.new.parse("Saus [10% tomaat*, (zout); peper.")
160
+ puts parsed.to_h
161
+ ```
162
+
163
+ Even though the strict parser would not give a result, the loose parser returns:
164
+ ```ruby
165
+ {
166
+ :contains=>[
167
+ {:name=>"Saus", :contains=>[
168
+ {:name=>"tomaat", :mark=>"*", :amount=>"10%"},
169
+ {:contains=>[{:name=>"zout"}]},
170
+ {:name=>"peper"}
171
+ ]}
172
+ ]
173
+ }
174
+ ```
175
+
141
176
  ## Test data
142
177
 
143
178
  [`data/ingredient-samples-nl`](data/ingredient-samples-nl) contains about 150k
144
179
  real-world ingredient lists found on the Dutch market. Each line contains one ingredient
145
180
  list (newlines are encoded as `\n`, empty lines and those starting with `#` are ignored).
146
- Currently almost three quarter is recognized and parsed. We aim to reach at least 90%.
181
+ The strict parser currently parses about three quarters, while the loose parser returns
182
+ something for all of them.
@@ -31,8 +31,7 @@ def colorize(color, s)
31
31
  end
32
32
  end
33
33
 
34
- def parse_single(s, parsed=nil, parser: nil, verbosity: 1, print: nil, escape: false, color: false)
35
- parser ||= FoodIngredientParser::Parser.new
34
+ def parse_single(s, parsed=nil, parser:, verbosity: 1, print: nil, escape: false, color: false)
36
35
  parsed ||= parser.parse(s)
37
36
 
38
37
  return unless print.nil? || (parsed && print == :parsed) || (!parsed && print == :noresult)
@@ -47,7 +46,7 @@ def parse_single(s, parsed=nil, parser: nil, verbosity: 1, print: nil, escape: f
47
46
  end
48
47
  end
49
48
 
50
- def parse_file(path, parser: nil, verbosity: 1, print: nil, escape: false, color: false)
49
+ def parse_file(path, parser:, verbosity: 1, print: nil, escape: false, color: false)
51
50
  count_parsed = count_noresult = 0
52
51
  File.foreach(path) do |line|
53
52
  next if line =~ /^#/ # comment
@@ -70,8 +69,13 @@ verbosity = 1
70
69
  files = []
71
70
  strings = []
72
71
  print = nil
72
+ parser_name = :strict
73
73
  escape = false
74
74
  color = true
75
+ PARSERS = {
76
+ strict: FoodIngredientParser::Strict::Parser,
77
+ loose: FoodIngredientParser::Loose::Parser
78
+ }
75
79
  OptionParser.new do |opts|
76
80
  opts.banner = <<-EOF.gsub(/^ /, '')
77
81
  Usage: #{$0} [options] --file|-f <filename>
@@ -84,7 +88,8 @@ OptionParser.new do |opts|
84
88
 
85
89
  opts.on("-q", "--[no-]quiet", "Only show summary.") {|q| verbosity = q ? 0 : 1 }
86
90
  opts.on("-p", "--parsed", "Only show lines that were successfully parsed.") {|p| print = :parsed }
87
- opts.on("-e", "--escape", "Escape newlines") {|e| escape = true }
91
+ opts.on("-r", "--parser PARSER", "Use specific parser (#{PARSERS.keys.join(", ")}).") {|p| parser_name = p&.downcase&.to_sym }
92
+ opts.on("-e", "--[no-]escape", "Escape newlines") {|e| escape = !!e }
88
93
  opts.on("-c", "--[no-]color", "Use color") {|e| color = !!e }
89
94
  opts.on("-n", "--noresult", "Only show lines that had no result.") {|p| print = :noresult }
90
95
  opts.on("-v", "--[no-]verbose", "Show more data (parsed tree).") {|v| verbosity = v ? 2 : 1 }
@@ -99,7 +104,10 @@ OptionParser.new do |opts|
99
104
  end.parse!
100
105
 
101
106
  if strings.any? || files.any?
102
- parser = FoodIngredientParser::Parser.new
107
+ unless parser = PARSERS[parser_name]&.new
108
+ STDERR.puts("Please specify one of the known parsers: #{PARSERS.keys.join(", ")}.")
109
+ exit(1)
110
+ end
103
111
  strings.each {|s| parse_single(s, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color) }
104
112
  files.each {|f| parse_file(f, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color) }
105
113
  else
@@ -0,0 +1,16 @@
1
+ module FoodIngredientParser
2
+ module Cleaner
3
+
4
+ def self.clean(s)
5
+ s.gsub!("\u00ad", "") # strip soft hyphen
6
+ s.gsub!("\u0092", "'") # windows-1252 apostrophe - https://stackoverflow.com/a/15564279/2866660
7
+ s.gsub!("aÄs", "aïs") # encoding issue for maïs
8
+ s.gsub!("ï", "ï") # encoding issue
9
+ s.gsub!("ë", "ë") # encoding issue
10
+ s.gsub!(/\A\s*"(.*)"\s*\z/, '\1') # enclosing double quotation marks
11
+ s.gsub!(/\A\s*'(.*)'\s*\z/, '\1') # enclosing single quotation marks
12
+ s
13
+ end
14
+
15
+ end
16
+ end
@@ -0,0 +1,60 @@
1
+ module FoodIngredientParser::Loose
2
+ # Parsing result.
3
+ class Node
4
+ attr_accessor :name, :mark, :amount, :contains, :notes
5
+ attr_reader :input, :interval, :auto_close
6
+
7
+ def initialize(input, interval, auto_close: false)
8
+ @input = input
9
+ @interval = interval.is_a?(Range) ? interval : ( interval .. interval )
10
+ @auto_close = auto_close
11
+ @contains = []
12
+ @notes = []
13
+ @name = @mark = @amount = nil
14
+ end
15
+
16
+ def ends(index)
17
+ @interval = @interval.first .. index
18
+ end
19
+
20
+ def <<(child)
21
+ @contains << child
22
+ end
23
+
24
+ def text_value
25
+ @input[@interval]
26
+ end
27
+
28
+ def to_h
29
+ r = {}
30
+ r[:name] = name.text_value.strip if name && name.text_value.strip != ''
31
+ r[:mark] = mark.text_value.strip if mark
32
+ r[:amount] = amount.text_value.strip if amount
33
+ r[:contains] = contains.map(&:to_h).reject {|c| c == {} } if contains.any?
34
+ r[:notes] = notes.map{|n| n.text_value.strip }.reject {|c| c == '' } if notes.any?
35
+ r
36
+ end
37
+
38
+ def inspect(indent="", variant="")
39
+ inspect_self(indent, variant) +
40
+ inspect_children(indent)
41
+ end
42
+
43
+ def inspect_self(indent="", variant="")
44
+ [
45
+ indent + "Node#{variant} interval=#{@interval}",
46
+ name ? "name=#{name.text_value.strip.inspect}" : nil,
47
+ mark ? "mark=#{mark.text_value.strip.inspect}" : nil,
48
+ amount ? "amount=#{amount.text_value.strip.inspect}" : nil,
49
+ auto_close ? "auto_close" : nil
50
+ ].compact.join(", ")
51
+ end
52
+
53
+ def inspect_children(indent="")
54
+ [
55
+ *contains.map {|child| "\n" + child.inspect(indent + " ") },
56
+ *notes.map {|note| "\n" + note.inspect(indent + " ", "(note)") }
57
+ ].join("")
58
+ end
59
+ end
60
+ end
@@ -0,0 +1,24 @@
1
+ require_relative '../cleaner'
2
+ require_relative 'scanner'
3
+ require_relative 'transform/amount'
4
+
5
+ module FoodIngredientParser::Loose
6
+ class Parser
7
+
8
+ # Create a new food ingredient stream parser
9
+ # @return [FoodIngredientParser::Loose::Parser]
10
+ def initialize
11
+ end
12
+
13
+ # Parse food ingredient list text into a structured representation.
14
+ #
15
+ # @option clean [Boolean] pass +false+ to disable correcting frequently occurring issues
16
+ # @return [FoodIngredientParser::Loose::Node] structured representation of food ingredients
17
+ def parse(s, clean: true, **options)
18
+ s = FoodIngredientParser::Cleaner.clean(s) if clean
19
+ n = Scanner.new(s).scan
20
+ n = Transform::Amount.transform!(n) if n
21
+ n
22
+ end
23
+ end
24
+ end
@@ -0,0 +1,191 @@
1
+ require_relative 'node'
2
+
3
+ module FoodIngredientParser::Loose
4
+ class Scanner
5
+
6
+ SEP_CHARS = "|;,.".freeze
7
+ MARK_CHARS = "¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº⁽⁾†‡•°#^*".freeze
8
+ PREFIX_RE = /\A\s*(ingredients|contains|ingred[iï][eë]nt(en)?(declaratie)?|bevat|dit zit er\s?in|samenstelling|zutaten)\s*[:;.]\s*/i.freeze
9
+ NOTE_RE = /\A\b(dit product kan\b|kan sporen\b.*?\bbevatten\b|voor allergenen\b|allergenen\b|E\s*=|gemaakt in\b|geproduceerd in\b|bevat mogelijk\b|kijk voor meer\b|allergie-info|in de fabriek\b|in dit bedrijf\b)/i.freeze
10
+
11
+ def initialize(s, index: 0)
12
+ @s = s # input string
13
+ @i = index # current index in string
14
+ @cur = nil # current node we're populating
15
+ @ancestors = [Node.new(@s, @i)] # nesting hierarchy
16
+ @iterator = :beginning # scan_iteration_<iterator> to use for parsing
17
+ @dest = :contains # append current node to this attribute on parent
18
+ end
19
+
20
+ def scan
21
+ loop do
22
+ method(:"scan_iteration_#{@iterator}").call
23
+ end
24
+
25
+ close_all_ancestors
26
+ @ancestors.first.ends(@i-1)
27
+ @ancestors.first
28
+ end
29
+
30
+ private
31
+
32
+ def loop
33
+ while @i < @s.length
34
+ @i += 1 if yield != false
35
+ end
36
+ end
37
+
38
+ def scan_iteration_beginning
39
+ # skip over some common prefixes
40
+ m = @s[@i .. -1].match(PREFIX_RE)
41
+ @i += m.offset(0).last if m
42
+ # now continue with the standard parsing
43
+ @iterator = :standard
44
+ false
45
+ end
46
+
47
+ def scan_iteration_standard
48
+ if "([".include?(c) # open nesting
49
+ open_parent
50
+ elsif ")]".include?(c) # close nesting
51
+ add_child
52
+ close_parent
53
+ elsif is_notes_start? # usually a dot marks the start of notes
54
+ close_all_ancestors
55
+ @iterator = :notes
56
+ @dest = :notes
57
+ elsif is_sep? # separator
58
+ add_child
59
+ elsif ":".include?(c) # another open nesting
60
+ add_child
61
+ open_parent(auto_close: true)
62
+ @iterator = :colon
63
+ elsif is_mark? && !cur.mark # mark after ingredient
64
+ name_until_here
65
+ len = mark_len
66
+ cur.mark = Node.new(@s, @i .. @i+len-1)
67
+ @i += len - 1
68
+ else
69
+ cur # reference to record starting position
70
+ end
71
+ end
72
+
73
+ def scan_iteration_colon
74
+ if "/".include?(c) # slash separator in colon nesting only
75
+ add_child
76
+ elsif is_sep? # regular separator indicates end of colon nesting
77
+ add_child
78
+ close_parent
79
+ # revert to standard parsing from here on
80
+ @iterator = :standard
81
+ scan_iteration_standard
82
+ elsif "([]):".include?(c) # continue with deeper nesting level
83
+ # revert to standard parsing from here on
84
+ @iterator = :standard
85
+ scan_iteration_standard
86
+ else
87
+ # normal handling for this character
88
+ scan_iteration_standard
89
+ end
90
+ end
91
+
92
+ def scan_iteration_notes
93
+ if is_sep?(chars: ".") # dot means new note
94
+ add_child
95
+ else
96
+ cur
97
+ end
98
+ end
99
+
100
+ def c
101
+ @s[@i]
102
+ end
103
+
104
+ def parent
105
+ @ancestors.last
106
+ end
107
+
108
+ def cur
109
+ @cur ||= Node.new(@s, @i)
110
+ end
111
+
112
+ def is_mark?
113
+ mark_len > 0 && @s[@i..@i+1] !~ /\A°[CF]/
114
+ end
115
+
116
+ def is_sep?(chars: SEP_CHARS)
117
+ chars.include?(c) && @s[@i-1..@i+1] !~ /\A\d.\d\z/
118
+ end
119
+
120
+ def mark_len
121
+ i = @i
122
+ while @s[i] && MARK_CHARS.include?(@s[i])
123
+ i += 1
124
+ end
125
+ i - @i
126
+ end
127
+
128
+ def is_notes_start?
129
+ # @todo use more heuristics: don't assume dot is notes when separator is a dot, and only toplevel?
130
+ if ( is_mark? && @s[@i+mark_len..-1] =~ /\A\s*=/ ) || # "* = Biologisch"
131
+ ( is_mark? && @s[@i-2..@i-1] =~ /\A\s\s/ ) || # " **Biologisch"
132
+ ( @s[@i..-1] =~ NOTE_RE ) # "E=", "Kan sporen van", ...
133
+ @i -= 1 # we want to include the mark in the note
134
+ true
135
+ # End of sentence
136
+ elsif dot_is_not_sep? && is_sep?(chars: ".")
137
+ true
138
+ else
139
+ false
140
+ end
141
+ end
142
+
143
+ def add_child
144
+ cur.ends(@i-1)
145
+ cur.name ||= Node.new(@s, cur.interval)
146
+ parent.send(@dest) << cur
147
+ @cur = nil
148
+ end
149
+
150
+ def open_parent(**options)
151
+ name_until_here
152
+ @ancestors << cur
153
+ @cur = Node.new(@s, @i + 1, **options)
154
+ end
155
+
156
+ def close_parent
157
+ return unless @ancestors.count > 1
158
+ @cur = @ancestors.pop
159
+ while @cur.auto_close
160
+ add_child
161
+ @cur = @ancestors.pop
162
+ end
163
+ end
164
+
165
+ def close_all_ancestors
166
+ while @ancestors.count > 1
167
+ add_child
168
+ close_parent
169
+ end
170
+ add_child
171
+ end
172
+
173
+ def name_until_here
174
+ cur.name ||= Node.new(@s, cur.interval.first .. @i-1)
175
+ end
176
+
177
+ def dot_is_not_sep?
178
+ # if separator is dot ".", don't use it for note detection
179
+ if @dot_is_not_sep.nil?
180
+ @dot_is_not_sep = begin
181
+ # @todo if another separator is found more often, dot is not a separator
182
+ num_words = @s.split(/\s+/).count
183
+ num_dots = @s.count(".")
184
+ # heuristic: 1/4+ of the words has a dot, with at least five words
185
+ num_words < 5 || 4 * num_dots < num_words
186
+ end
187
+ end
188
+ @dot_is_not_sep
189
+ end
190
+ end
191
+ end
@@ -0,0 +1,70 @@
1
+ require 'treetop'
2
+ require_relative '../../strict/nodes'
3
+ Treetop.load File.dirname(__FILE__) + '/../../strict/grammar/common'
4
+ Treetop.load File.dirname(__FILE__) + '/../../strict/grammar/amount'
5
+ Treetop.load File.dirname(__FILE__) + '/amount_from_name'
6
+
7
+ require_relative '../node'
8
+
9
+ module FoodIngredientParser::Loose
10
+ module Transform
11
+ # Transforms node tree to extract amount into its own attribute.
12
+ class Amount
13
+ def self.transform!(node)
14
+ new(node).transform!
15
+ end
16
+
17
+ def initialize(node)
18
+ @node = node
19
+ @parser = FoodIngredientParser::Loose::Transform::AmountFromNameParser.new
20
+ end
21
+
22
+ def transform!
23
+ transform_name
24
+ transform_contains
25
+ @node
26
+ end
27
+
28
+ private
29
+
30
+ # Extract amount from name, if any.
31
+ def transform_name(node = @node)
32
+ if !node.amount && parsed = parse_amount(node.name&.text_value)
33
+ offset = node.name.interval.first
34
+
35
+ amount = parsed.amount.amount
36
+ node.amount = Node.new(node.input, offset + amount.interval.first .. offset + amount.interval.last - 1)
37
+
38
+ name = parsed.respond_to?(:name) && parsed.name
39
+ if name && name.interval.count > 0
40
+ node.name = Node.new(node.input, offset + name.interval.first .. offset + name.interval.last - 1)
41
+ else
42
+ node.name = nil
43
+ end
44
+ end
45
+
46
+ # recursively transform contained nodes
47
+ node.contains&.each(&method(:transform_name))
48
+ end
49
+
50
+ # If first or last child is an amount, it's this node's amount.
51
+ # Assumes all names already have extracted their amounts with {{#transform_name}}.
52
+ def transform_contains(node = @node)
53
+ if !node.amount && node.contains.any?
54
+ if node.contains.first.name.nil? && node.contains.first.amount
55
+ node.amount = node.contains.shift.amount
56
+ elsif node.contains.count > 1 && node.contains.last.name.nil? && node.contains.last.amount
57
+ node.amount = node.contains.pop.amount
58
+ end
59
+ end
60
+
61
+ # recursively transform contained nodes
62
+ node.contains.each(&method(:transform_contains))
63
+ end
64
+
65
+ def parse_amount(s)
66
+ @parser.parse(s) if s && s.strip != ''
67
+ end
68
+ end
69
+ end
70
+ end
@@ -0,0 +1,13 @@
1
+ module FoodIngredientParser::Loose::Transform
2
+ grammar AmountFromName
3
+ include FoodIngredientParser::Strict::Grammar::Common
4
+ include FoodIngredientParser::Strict::Grammar::Amount
5
+
6
+ rule amount_from_name
7
+ # just amount, amount in front or at the end
8
+ ws* amount:amount ws+ name:(.*) /
9
+ ws* amount:amount ws* /
10
+ ws* name:( !amount word ( ws+ !amount word )* )+ ws* amount:amount ws*
11
+ end
12
+ end
13
+ end
@@ -1,4 +1,4 @@
1
- module FoodIngredientParser::Grammar
1
+ module FoodIngredientParser::Strict::Grammar
2
2
  grammar Amount
3
3
  include Common
4
4
 
@@ -12,18 +12,19 @@ module FoodIngredientParser::Grammar
12
12
  rule simple_amount
13
13
  ( (
14
14
  'of which'i / 'at least'i / 'minimal'i / 'maximal'i / 'less than'i / 'more than'i /
15
- 'waarvan'i / 'ten minste'i / 'tenminste'i / 'minimaal'i / 'maximaal'i / 'minder dan'i / 'meer dan'i
15
+ 'waarvan'i / 'ten minste'i / 'tenminste'i / 'minimaal'i / 'maximaal'i / 'minder dan'i / 'meer dan'i /
16
+ 'min.'i / 'min'i / 'max.'i / 'max'i
16
17
  ) ws* )?
17
18
  [±∓~∼∽≂≃≈≲≤<>≥≳]? ws*
18
19
  simple_amount_quantity
19
20
  ( ws+ (
20
- 'minimum'i /
21
- 'minimaal'i / 'minimum'i
21
+ 'minimaal'i / 'minimum'i / 'van het uitlekgewicht'i / 'van het geheel'i /
22
+ 'min.'i / 'min'i / 'max.'i / 'max'i
22
23
  ) )?
23
24
  end
24
25
 
25
26
  rule simple_amount_quantity
26
- number ( ws* '-' ws* number )? ws* ( '%' / 'g'i / 'mg'i / 'gram'i / 'ml'i )
27
+ number ( ws* '-' ws* number )? ws* ( [%٪⁒%﹪] / ( ( 'procent' / 'percent' / 'gram'i / 'ml'i / 'mg'i / 'g'i ) !char ) )
27
28
  end
28
29
 
29
30
  end
@@ -1,4 +1,4 @@
1
- module FoodIngredientParser::Grammar
1
+ module FoodIngredientParser::Strict::Grammar
2
2
  grammar Common
3
3
 
4
4
  rule ws
@@ -1,4 +1,4 @@
1
- module FoodIngredientParser::Grammar
1
+ module FoodIngredientParser::Strict::Grammar
2
2
  grammar Ingredient
3
3
  include IngredientSimple
4
4
  include IngredientNested
@@ -1,4 +1,4 @@
1
- module FoodIngredientParser::Grammar
1
+ module FoodIngredientParser::Strict::Grammar
2
2
  grammar IngredientColoned
3
3
  include Common
4
4
  include Amount
@@ -1,4 +1,4 @@
1
- module FoodIngredientParser::Grammar
1
+ module FoodIngredientParser::Strict::Grammar
2
2
  grammar IngredientNested
3
3
  include Common
4
4
  include Amount
@@ -1,4 +1,4 @@
1
- module FoodIngredientParser::Grammar
1
+ module FoodIngredientParser::Strict::Grammar
2
2
  grammar IngredientSimple
3
3
  include Common
4
4
  include Amount
@@ -1,4 +1,4 @@
1
- module FoodIngredientParser::Grammar
1
+ module FoodIngredientParser::Strict::Grammar
2
2
  grammar List
3
3
  include Common
4
4
  include Ingredient
@@ -1,4 +1,4 @@
1
- module FoodIngredientParser::Grammar
1
+ module FoodIngredientParser::Strict::Grammar
2
2
  grammar ListColoned
3
3
  include Common
4
4
  include IngredientSimple
@@ -1,4 +1,4 @@
1
- module FoodIngredientParser::Grammar
1
+ module FoodIngredientParser::Strict::Grammar
2
2
  grammar ListNewlined
3
3
  include Common
4
4
  include IngredientSimple
@@ -1,4 +1,4 @@
1
- module FoodIngredientParser::Grammar
1
+ module FoodIngredientParser::Strict::Grammar
2
2
  grammar Root
3
3
  include Common
4
4
  include List
@@ -0,0 +1,74 @@
1
+ require 'treetop/runtime'
2
+ require_relative 'to_html'
3
+
4
+ # Needs to be in grammar namespace so Treetop can find the nodes.
5
+ module FoodIngredientParser::Strict
6
+ module Grammar
7
+
8
+ # Treetop syntax node with our additions, use this as parent for all our own nodes.
9
+ class SyntaxNode < Treetop::Runtime::SyntaxNode
10
+ private
11
+
12
+ def to_a_deep(n, cls)
13
+ if n.is_a?(cls)
14
+ [n]
15
+ elsif n.nonterminal?
16
+ n.elements.map {|m| to_a_deep(m, cls) }.flatten(1).compact
17
+ end
18
+ end
19
+ end
20
+
21
+ # Root object, contains everything else.
22
+ class RootNode < SyntaxNode
23
+ include FoodIngredientParser::Strict::ToHtml
24
+
25
+ def to_h
26
+ h = { contains: contains.to_a }
27
+ if notes && notes_ary = to_a_deep(notes, NoteNode)&.map(&:text_value)
28
+ h[:notes] = notes_ary if notes_ary.length > 0
29
+ end
30
+ h
31
+ end
32
+ end
33
+
34
+ # List of ingredients.
35
+ class ListNode < SyntaxNode
36
+ def to_a
37
+ to_a_deep(contains, IngredientNode).map(&:to_h)
38
+ end
39
+ end
40
+
41
+ # Ingredient
42
+ class IngredientNode < SyntaxNode
43
+ def to_h
44
+ h = {}
45
+ h.merge!(to_a_deep(ing, IngredientNode)&.first&.to_h || {}) if respond_to?(:ing)
46
+ h.merge!(to_a_deep(amount, AmountNode)&.first&.to_h || {}) if respond_to?(:amount)
47
+ h[:name] = name.text_value if respond_to?(:name)
48
+ h[:name] = pre.text_value + h[:name] if respond_to?(:pre)
49
+ h[:name] = h[:name] + post.text_value if respond_to?(:post)
50
+ h[:mark] = mark.text_value if respond_to?(:mark) && mark.text_value != ''
51
+ h
52
+ end
53
+ end
54
+
55
+ # Ingredient with containing ingredients.
56
+ class NestedIngredientNode < IngredientNode
57
+ def to_h
58
+ super.merge({ contains: to_a_deep(contains, IngredientNode).map(&:to_h) })
59
+ end
60
+ end
61
+
62
+ # Amount, specifying an ingredient.
63
+ class AmountNode < SyntaxNode
64
+ def to_h
65
+ { amount: amount.text_value }
66
+ end
67
+ end
68
+
69
+ # Note at the end of the ingredient list.
70
+ class NoteNode < SyntaxNode
71
+ end
72
+
73
+ end
74
+ end
@@ -1,6 +1,7 @@
1
1
  require_relative 'grammar'
2
+ require_relative '../cleaner'
2
3
 
3
- module FoodIngredientParser
4
+ module FoodIngredientParser::Strict
4
5
  class Parser
5
6
 
6
7
  # @!attribute [r] parser
@@ -20,22 +21,9 @@ module FoodIngredientParser
20
21
  # @return [FoodIngredientParser::Strict::Grammar::RootNode] structured representation of food ingredients
21
22
  # @note Unrecognized options are passed to Treetop, but this is not guaranteed to remain so forever.
22
23
  def parse(s, clean: true, **options)
23
- s = clean(s) if clean
24
+ s = FoodIngredientParser::Cleaner.clean(s) if clean
24
25
  @parser.parse(s, **options)
25
26
  end
26
27
 
27
- private
28
-
29
- def clean(s)
30
- s.gsub!("\u00ad", "") # strip soft hyphen
31
- s.gsub!("\u0092", "'") # windows-1252 apostrophe - https://stackoverflow.com/a/15564279/2866660
32
- s.gsub!("aÄs", "aïs") # encoding issue for maïs
33
- s.gsub!("ï", "ï") # encoding issue
34
- s.gsub!("ë", "ë") # encoding issue
35
- s.gsub!(/\A\s*"(.*)"\s*\z/, '\1') # enclosing double quotation marks
36
- s.gsub!(/\A\s*'(.*)'\s*\z/, '\1') # enclosing single quotation marks
37
- s
38
- end
39
-
40
28
  end
41
29
  end
@@ -0,0 +1,54 @@
1
+ require 'cgi'
2
+
3
+ # Adds HTML output functionality to a Treetop Node.
4
+ #
5
+ # The node needs to provide a {#to_h} method (for {#to_html_h}).
6
+ #
7
+ module FoodIngredientParser::Strict
8
+ module ToHtml
9
+
10
+ # Markup original ingredients list text in HTML.
11
+ #
12
+ # The input text is returned as HTML, augmented with CSS classes
13
+ # on +span+s for +name+, +amount+, +mark+ and +note+.
14
+ #
15
+ # @return [String] HTML representation of ingredient list.
16
+ def to_html
17
+ node_to_html(self)
18
+ end
19
+
20
+ private
21
+
22
+ def node_to_html(node, cls=nil, depth=0)
23
+ el_cls = {} # map of node instances to class names for contained elements
24
+ terminal = node.terminal? # whether to look at children elements or not
25
+
26
+ if node.is_a?(FoodIngredientParser::Strict::Grammar::AmountNode)
27
+ cls ||= "amount"
28
+ elsif node.is_a?(FoodIngredientParser::Strict::Grammar::NoteNode)
29
+ cls ||= "note"
30
+ terminal = true # NoteNodes may contain other NoteNodes, we want it flat.
31
+ elsif node.is_a?(FoodIngredientParser::Strict::Grammar::IngredientNode)
32
+ el_cls[node.name] = "name" if node.respond_to?(:name)
33
+ el_cls[node.mark] = "mark" if node.respond_to?(:mark)
34
+ if node.respond_to?(:contains)
35
+ el_cls[node.contains] = "contains depth#{depth}"
36
+ depth += 1
37
+ end
38
+ elsif node.is_a?(FoodIngredientParser::Strict::Grammar::RootNode)
39
+ if node.respond_to?(:contains)
40
+ el_cls[node.contains] = "depth#{depth}"
41
+ depth += 1
42
+ end
43
+ end
44
+
45
+ val = if terminal
46
+ CGI.escapeHTML(node.text_value)
47
+ else
48
+ node.elements.map {|el| node_to_html(el, el_cls[el], depth) }.join("")
49
+ end
50
+
51
+ cls ? "<span class='#{cls}'>#{val}</span>" : val
52
+ end
53
+ end
54
+ end
@@ -1,4 +1,4 @@
1
1
  module FoodIngredientParser
2
- VERSION = '1.0.0.pre.5'
3
- VERSION_DATE = '2018-09-07'
2
+ VERSION = '1.0.0.pre.6'
3
+ VERSION_DATE = '2018-09-17'
4
4
  end
@@ -1,2 +1,3 @@
1
1
  require_relative 'food_ingredient_parser/version'
2
- require_relative 'food_ingredient_parser/parser'
2
+ require_relative 'food_ingredient_parser/strict/parser'
3
+ require_relative 'food_ingredient_parser/loose/parser'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: food_ingredient_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0.pre.5
4
+ version: 1.0.0.pre.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - wvengen
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-09-07 00:00:00.000000000 Z
11
+ date: 2018-09-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: treetop
@@ -42,20 +42,26 @@ files:
42
42
  - bin/food_ingredient_parser
43
43
  - food_ingredient_parser.gemspec
44
44
  - lib/food_ingredient_parser.rb
45
- - lib/food_ingredient_parser/grammar.rb
46
- - lib/food_ingredient_parser/grammar/amount.treetop
47
- - lib/food_ingredient_parser/grammar/common.treetop
48
- - lib/food_ingredient_parser/grammar/ingredient.treetop
49
- - lib/food_ingredient_parser/grammar/ingredient_coloned.treetop
50
- - lib/food_ingredient_parser/grammar/ingredient_nested.treetop
51
- - lib/food_ingredient_parser/grammar/ingredient_simple.treetop
52
- - lib/food_ingredient_parser/grammar/list.treetop
53
- - lib/food_ingredient_parser/grammar/list_coloned.treetop
54
- - lib/food_ingredient_parser/grammar/list_newlined.treetop
55
- - lib/food_ingredient_parser/grammar/root.treetop
56
- - lib/food_ingredient_parser/nodes.rb
57
- - lib/food_ingredient_parser/parser.rb
58
- - lib/food_ingredient_parser/to_html.rb
45
+ - lib/food_ingredient_parser/cleaner.rb
46
+ - lib/food_ingredient_parser/loose/node.rb
47
+ - lib/food_ingredient_parser/loose/parser.rb
48
+ - lib/food_ingredient_parser/loose/scanner.rb
49
+ - lib/food_ingredient_parser/loose/transform/amount.rb
50
+ - lib/food_ingredient_parser/loose/transform/amount_from_name.treetop
51
+ - lib/food_ingredient_parser/strict/grammar.rb
52
+ - lib/food_ingredient_parser/strict/grammar/amount.treetop
53
+ - lib/food_ingredient_parser/strict/grammar/common.treetop
54
+ - lib/food_ingredient_parser/strict/grammar/ingredient.treetop
55
+ - lib/food_ingredient_parser/strict/grammar/ingredient_coloned.treetop
56
+ - lib/food_ingredient_parser/strict/grammar/ingredient_nested.treetop
57
+ - lib/food_ingredient_parser/strict/grammar/ingredient_simple.treetop
58
+ - lib/food_ingredient_parser/strict/grammar/list.treetop
59
+ - lib/food_ingredient_parser/strict/grammar/list_coloned.treetop
60
+ - lib/food_ingredient_parser/strict/grammar/list_newlined.treetop
61
+ - lib/food_ingredient_parser/strict/grammar/root.treetop
62
+ - lib/food_ingredient_parser/strict/nodes.rb
63
+ - lib/food_ingredient_parser/strict/parser.rb
64
+ - lib/food_ingredient_parser/strict/to_html.rb
59
65
  - lib/food_ingredient_parser/version.rb
60
66
  homepage: https://github.com/q-m/food-ingredient-parser-ruby
61
67
  licenses:
@@ -1,72 +0,0 @@
1
- require 'treetop/runtime'
2
- require_relative 'to_html'
3
-
4
- # Needs to be in grammar namespace so Treetop can find the nodes.
5
- module FoodIngredientParser::Grammar
6
-
7
- # Treetop syntax node with our additions, use this as parent for all our own nodes.
8
- class SyntaxNode < Treetop::Runtime::SyntaxNode
9
- private
10
-
11
- def to_a_deep(n, cls)
12
- if n.is_a?(cls)
13
- [n]
14
- elsif n.nonterminal?
15
- n.elements.map {|m| to_a_deep(m, cls) }.flatten(1).compact
16
- end
17
- end
18
- end
19
-
20
- # Root object, contains everything else.
21
- class RootNode < SyntaxNode
22
- include FoodIngredientParser::ToHtml
23
-
24
- def to_h
25
- h = { contains: contains.to_a }
26
- if notes && notes_ary = to_a_deep(notes, NoteNode)&.map(&:text_value)
27
- h[:notes] = notes_ary if notes_ary.length > 0
28
- end
29
- h
30
- end
31
- end
32
-
33
- # List of ingredients.
34
- class ListNode < SyntaxNode
35
- def to_a
36
- to_a_deep(contains, IngredientNode).map(&:to_h)
37
- end
38
- end
39
-
40
- # Ingredient
41
- class IngredientNode < SyntaxNode
42
- def to_h
43
- h = {}
44
- h.merge!(to_a_deep(ing, IngredientNode)&.first&.to_h || {}) if respond_to?(:ing)
45
- h.merge!(to_a_deep(amount, AmountNode)&.first&.to_h || {}) if respond_to?(:amount)
46
- h[:name] = name.text_value if respond_to?(:name)
47
- h[:name] = pre.text_value + h[:name] if respond_to?(:pre)
48
- h[:name] = h[:name] + post.text_value if respond_to?(:post)
49
- h[:mark] = mark.text_value if respond_to?(:mark) && mark.text_value != ''
50
- h
51
- end
52
- end
53
-
54
- # Ingredient with containing ingredients.
55
- class NestedIngredientNode < IngredientNode
56
- def to_h
57
- super.merge({ contains: to_a_deep(contains, IngredientNode).map(&:to_h) })
58
- end
59
- end
60
-
61
- # Amount, specifying an ingredient.
62
- class AmountNode < SyntaxNode
63
- def to_h
64
- { amount: amount.text_value }
65
- end
66
- end
67
-
68
- # Note at the end of the ingredient list.
69
- class NoteNode < SyntaxNode
70
- end
71
-
72
- end
@@ -1,52 +0,0 @@
1
- require 'cgi'
2
-
3
- # Adds HTML output functionality to a Treetop Node.
4
- #
5
- # The node needs to provide a {#to_h} method (for {#to_html_h}).
6
- #
7
- module FoodIngredientParser::ToHtml
8
-
9
- # Markup original ingredients list text in HTML.
10
- #
11
- # The input text is returned as HTML, augmented with CSS classes
12
- # on +span+s for +name+, +amount+, +mark+ and +note+.
13
- #
14
- # @return [String] HTML representation of ingredient list.
15
- def to_html
16
- node_to_html(self)
17
- end
18
-
19
- private
20
-
21
- def node_to_html(node, cls=nil, depth=0)
22
- el_cls = {} # map of node instances to class names for contained elements
23
- terminal = node.terminal? # whether to look at children elements or not
24
-
25
- if node.is_a?(FoodIngredientParser::Grammar::AmountNode)
26
- cls ||= "amount"
27
- elsif node.is_a?(FoodIngredientParser::Grammar::NoteNode)
28
- cls ||= "note"
29
- terminal = true # NoteNodes may contain other NoteNodes, we want it flat.
30
- elsif node.is_a?(FoodIngredientParser::Grammar::IngredientNode)
31
- el_cls[node.name] = "name" if node.respond_to?(:name)
32
- el_cls[node.mark] = "mark" if node.respond_to?(:mark)
33
- if node.respond_to?(:contains)
34
- el_cls[node.contains] = "contains depth#{depth}"
35
- depth += 1
36
- end
37
- elsif node.is_a?(FoodIngredientParser::Grammar::RootNode)
38
- if node.respond_to?(:contains)
39
- el_cls[node.contains] = "depth#{depth}"
40
- depth += 1
41
- end
42
- end
43
-
44
- val = if terminal
45
- CGI.escapeHTML(node.text_value)
46
- else
47
- node.elements.map {|el| node_to_html(el, el_cls[el], depth) }.join("")
48
- end
49
-
50
- cls ? "<span class='#{cls}'>#{val}</span>" : val
51
- end
52
- end