food_ingredient_parser 1.0.0.pre.5 → 1.0.0.pre.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (28) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +44 -8
  3. data/bin/food_ingredient_parser +13 -5
  4. data/lib/food_ingredient_parser/cleaner.rb +16 -0
  5. data/lib/food_ingredient_parser/loose/node.rb +60 -0
  6. data/lib/food_ingredient_parser/loose/parser.rb +24 -0
  7. data/lib/food_ingredient_parser/loose/scanner.rb +191 -0
  8. data/lib/food_ingredient_parser/loose/transform/amount.rb +70 -0
  9. data/lib/food_ingredient_parser/loose/transform/amount_from_name.treetop +13 -0
  10. data/lib/food_ingredient_parser/{grammar → strict/grammar}/amount.treetop +6 -5
  11. data/lib/food_ingredient_parser/{grammar → strict/grammar}/common.treetop +1 -1
  12. data/lib/food_ingredient_parser/{grammar → strict/grammar}/ingredient.treetop +1 -1
  13. data/lib/food_ingredient_parser/{grammar → strict/grammar}/ingredient_coloned.treetop +1 -1
  14. data/lib/food_ingredient_parser/{grammar → strict/grammar}/ingredient_nested.treetop +1 -1
  15. data/lib/food_ingredient_parser/{grammar → strict/grammar}/ingredient_simple.treetop +1 -1
  16. data/lib/food_ingredient_parser/{grammar → strict/grammar}/list.treetop +1 -1
  17. data/lib/food_ingredient_parser/{grammar → strict/grammar}/list_coloned.treetop +1 -1
  18. data/lib/food_ingredient_parser/{grammar → strict/grammar}/list_newlined.treetop +1 -1
  19. data/lib/food_ingredient_parser/{grammar → strict/grammar}/root.treetop +1 -1
  20. data/lib/food_ingredient_parser/strict/nodes.rb +74 -0
  21. data/lib/food_ingredient_parser/{parser.rb → strict/parser.rb} +3 -15
  22. data/lib/food_ingredient_parser/strict/to_html.rb +54 -0
  23. data/lib/food_ingredient_parser/version.rb +2 -2
  24. data/lib/food_ingredient_parser.rb +2 -1
  25. metadata +22 -16
  26. data/lib/food_ingredient_parser/nodes.rb +0 -72
  27. data/lib/food_ingredient_parser/to_html.rb +0 -52
  28. /data/lib/food_ingredient_parser/{grammar.rb → strict/grammar.rb} +0 -0
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8725e4ed3763020de6b46cad6709ce05aca0b77f
4
- data.tar.gz: fedb82af99346e8db38a3beaca5914ab44be197a
3
+ metadata.gz: 54bdb9187f9a2dfbec67737ddc2a3ad90f4ca058
4
+ data.tar.gz: fcfc99674e0f58801ca3a375acebe91ba3f80c84
5
5
  SHA512:
6
- metadata.gz: 114feb403f87140f2eccc21860c9a75aff4cff3583f3046f9815299f90dc6374c73b3b9ee60d682c3a9f91b2803ab3fc6c934f3255716986a494fbb51c9e5564
7
- data.tar.gz: 8f94825d627ab8068fb3dfd6c20f61a20fc8612f06ec6a760d33e46def103ca11497e364d9e3a7c4720647794e644a7253ba6e226d984801a55e089a5961b753
6
+ metadata.gz: 773526e862a74f04614486f3542de0c89f8663e10f959978a7d3a2e1ba8703e8a9ae93bde6b043a62326be68956733a30e6c01d85b6578dba0abc9064590fb18
7
+ data.tar.gz: e3296ae3222745f20727eed70bd1a9dac84a1714f1bdc64c2cd524bfbd6589a470afea33eab1e4ce717dbec9f4b3b3d8d9350025d13a4046e78cea3d0aea9b6d
data/README.md CHANGED
@@ -22,11 +22,11 @@ require 'food_ingredient_parser'
22
22
  s = "Water* 60%, suiker 30%, voedingszuren: citroenzuur, appelzuur, zuurteregelaar: E576/E577, " \
23
23
  + "natuurlijke citroen-limoen aroma's 0,2%, zoetstof: steviolglycosiden, * = Biologisch. " \
24
24
  + "E = door de E.U. goedgekeurde toevoeging."
25
- parser = FoodIngredientParser::Parser.new
25
+ parser = FoodIngredientParser::Strict::Parser.new
26
26
  puts parser.parse(s).to_h.inspect
27
27
  ```
28
28
  Results in
29
- ```
29
+ ```ruby
30
30
  {
31
31
  :contains=>[
32
32
  {:name=>"Water", :amount=>"60%", :mark=>"*"},
@@ -58,14 +58,15 @@ running this from the source tree, use `bin/food_ingredient_parser` instead.
58
58
 
59
59
  ```
60
60
  $ food_ingredient_parser -h
61
- Usage: food_ingredient_parser [options] --file|-f <filename>
62
- food_ingredient_parser [options] --string|-s <ingredients>
61
+ Usage: bin/food_ingredient_parser [options] --file|-f <filename>
62
+ bin/food_ingredient_parser [options] --string|-s <ingredients>
63
63
 
64
64
  -f, --file FILE Parse all lines of the file as ingredient lists.
65
65
  -s, --string INGREDIENTS Parse specified ingredient list.
66
66
  -q, --[no-]quiet Only show summary.
67
67
  -p, --parsed Only show lines that were successfully parsed.
68
- -e, --escape Escape newlines
68
+ -r, --parser PARSER Use specific parser (strict, loose).
69
+ -e, --[no-]escape Escape newlines
69
70
  -c, --[no-]color Use color
70
71
  -n, --noresult Only show lines that had no result.
71
72
  -v, --[no-]verbose Show more data (parsed tree).
@@ -102,6 +103,12 @@ RootNode+Root3 offset=0, "tomato" (contains,notes):
102
103
  SyntaxNode offset=6, ""
103
104
  {:contains=>[{:name=>"tomato"}]}
104
105
 
106
+ $ food_ingredient_parser -v -r loose -s "tomato"
107
+ "tomato"
108
+ Node interval=0..5
109
+ Node interval=0..5, name="tomato"
110
+ {:contains=>[{:name=>"tomato"}]}
111
+
105
112
  $ food_ingredient_parser -q -f data/test-cases
106
113
  parsed 35 (100.0%), no result 0 (0.0%)
107
114
  ```
@@ -114,12 +121,12 @@ When ingredient lists are entered manually, it can be very useful to show how th
114
121
  recognized. This can help understanding why a certain ingredients list cannot be parsed.
115
122
 
116
123
  For this you can use the `to_html` method on the parsed output, which returns the original
117
- text, augmented with CSS classes for different parts.
124
+ text, augmented with CSS classes for different parts. (Available for strict parser only.)
118
125
 
119
126
  ```ruby
120
127
  require 'food_ingredient_parser'
121
128
 
122
- parsed = FoodIngredientParser::Parser.new.parse("Saus (10% tomaat*, zout). * = bio")
129
+ parsed = FoodIngredientParser::Strict::Parser.new.parse("Saus (10% tomaat*, zout). * = bio")
123
130
  puts parsed.to_html
124
131
  ```
125
132
 
@@ -138,9 +145,38 @@ For an example of an interactive editor, see [examples/editor.rb](examples/edito
138
145
 
139
146
  ![editor example screenshot](examples/editor-screenshot.png)
140
147
 
148
+ ## Loose parser
149
+
150
+ The strict parser only parses ingredient lists that conform to one of the many different
151
+ formats expected. If you'd like to always get a result, even if it is not necessarily
152
+ completely correct, you can use the _loose_ parser. This does not use Treetop, but looks
153
+ at the input character by character and tries to make the best of it. Nevertheless, if you
154
+ just want to have _some_ result, this can still be very useful.
155
+
156
+ ```ruby
157
+ require 'food_ingredient_parser'
158
+
159
+ parsed = FoodIngredientParser::Loose::Parser.new.parse("Saus [10% tomaat*, (zout); peper.")
160
+ puts parsed.to_h
161
+ ```
162
+
163
+ Even though the strict parser would not give a result, the loose parser returns:
164
+ ```ruby
165
+ {
166
+ :contains=>[
167
+ {:name=>"Saus", :contains=>[
168
+ {:name=>"tomaat", :mark=>"*", :amount=>"10%"},
169
+ {:contains=>[{:name=>"zout"}]},
170
+ {:name=>"peper"}
171
+ ]}
172
+ ]
173
+ }
174
+ ```
175
+
141
176
  ## Test data
142
177
 
143
178
  [`data/ingredient-samples-nl`](data/ingredient-samples-nl) contains about 150k
144
179
  real-world ingredient lists found on the Dutch market. Each line contains one ingredient
145
180
  list (newlines are encoded as `\n`, empty lines and those starting with `#` are ignored).
146
- Currently almost three quarter is recognized and parsed. We aim to reach at least 90%.
181
+ The strict parser currently parses about three quarters, while the loose parser returns
182
+ something for all of them.
@@ -31,8 +31,7 @@ def colorize(color, s)
31
31
  end
32
32
  end
33
33
 
34
- def parse_single(s, parsed=nil, parser: nil, verbosity: 1, print: nil, escape: false, color: false)
35
- parser ||= FoodIngredientParser::Parser.new
34
+ def parse_single(s, parsed=nil, parser:, verbosity: 1, print: nil, escape: false, color: false)
36
35
  parsed ||= parser.parse(s)
37
36
 
38
37
  return unless print.nil? || (parsed && print == :parsed) || (!parsed && print == :noresult)
@@ -47,7 +46,7 @@ def parse_single(s, parsed=nil, parser: nil, verbosity: 1, print: nil, escape: f
47
46
  end
48
47
  end
49
48
 
50
- def parse_file(path, parser: nil, verbosity: 1, print: nil, escape: false, color: false)
49
+ def parse_file(path, parser:, verbosity: 1, print: nil, escape: false, color: false)
51
50
  count_parsed = count_noresult = 0
52
51
  File.foreach(path) do |line|
53
52
  next if line =~ /^#/ # comment
@@ -70,8 +69,13 @@ verbosity = 1
70
69
  files = []
71
70
  strings = []
72
71
  print = nil
72
+ parser_name = :strict
73
73
  escape = false
74
74
  color = true
75
+ PARSERS = {
76
+ strict: FoodIngredientParser::Strict::Parser,
77
+ loose: FoodIngredientParser::Loose::Parser
78
+ }
75
79
  OptionParser.new do |opts|
76
80
  opts.banner = <<-EOF.gsub(/^ /, '')
77
81
  Usage: #{$0} [options] --file|-f <filename>
@@ -84,7 +88,8 @@ OptionParser.new do |opts|
84
88
 
85
89
  opts.on("-q", "--[no-]quiet", "Only show summary.") {|q| verbosity = q ? 0 : 1 }
86
90
  opts.on("-p", "--parsed", "Only show lines that were successfully parsed.") {|p| print = :parsed }
87
- opts.on("-e", "--escape", "Escape newlines") {|e| escape = true }
91
+ opts.on("-r", "--parser PARSER", "Use specific parser (#{PARSERS.keys.join(", ")}).") {|p| parser_name = p&.downcase&.to_sym }
92
+ opts.on("-e", "--[no-]escape", "Escape newlines") {|e| escape = !!e }
88
93
  opts.on("-c", "--[no-]color", "Use color") {|e| color = !!e }
89
94
  opts.on("-n", "--noresult", "Only show lines that had no result.") {|p| print = :noresult }
90
95
  opts.on("-v", "--[no-]verbose", "Show more data (parsed tree).") {|v| verbosity = v ? 2 : 1 }
@@ -99,7 +104,10 @@ OptionParser.new do |opts|
99
104
  end.parse!
100
105
 
101
106
  if strings.any? || files.any?
102
- parser = FoodIngredientParser::Parser.new
107
+ unless parser = PARSERS[parser_name]&.new
108
+ STDERR.puts("Please specify one of the known parsers: #{PARSERS.keys.join(", ")}.")
109
+ exit(1)
110
+ end
103
111
  strings.each {|s| parse_single(s, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color) }
104
112
  files.each {|f| parse_file(f, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color) }
105
113
  else
@@ -0,0 +1,16 @@
1
+ module FoodIngredientParser
2
+ module Cleaner
3
+
4
+ def self.clean(s)
5
+ s.gsub!("\u00ad", "") # strip soft hyphen
6
+ s.gsub!("\u0092", "'") # windows-1252 apostrophe - https://stackoverflow.com/a/15564279/2866660
7
+ s.gsub!("aÄs", "aïs") # encoding issue for maïs
8
+ s.gsub!("ï", "ï") # encoding issue
9
+ s.gsub!("ë", "ë") # encoding issue
10
+ s.gsub!(/\A\s*"(.*)"\s*\z/, '\1') # enclosing double quotation marks
11
+ s.gsub!(/\A\s*'(.*)'\s*\z/, '\1') # enclosing single quotation marks
12
+ s
13
+ end
14
+
15
+ end
16
+ end
@@ -0,0 +1,60 @@
1
+ module FoodIngredientParser::Loose
2
+ # Parsing result.
3
+ class Node
4
+ attr_accessor :name, :mark, :amount, :contains, :notes
5
+ attr_reader :input, :interval, :auto_close
6
+
7
+ def initialize(input, interval, auto_close: false)
8
+ @input = input
9
+ @interval = interval.is_a?(Range) ? interval : ( interval .. interval )
10
+ @auto_close = auto_close
11
+ @contains = []
12
+ @notes = []
13
+ @name = @mark = @amount = nil
14
+ end
15
+
16
+ def ends(index)
17
+ @interval = @interval.first .. index
18
+ end
19
+
20
+ def <<(child)
21
+ @contains << child
22
+ end
23
+
24
+ def text_value
25
+ @input[@interval]
26
+ end
27
+
28
+ def to_h
29
+ r = {}
30
+ r[:name] = name.text_value.strip if name && name.text_value.strip != ''
31
+ r[:mark] = mark.text_value.strip if mark
32
+ r[:amount] = amount.text_value.strip if amount
33
+ r[:contains] = contains.map(&:to_h).reject {|c| c == {} } if contains.any?
34
+ r[:notes] = notes.map{|n| n.text_value.strip }.reject {|c| c == '' } if notes.any?
35
+ r
36
+ end
37
+
38
+ def inspect(indent="", variant="")
39
+ inspect_self(indent, variant) +
40
+ inspect_children(indent)
41
+ end
42
+
43
+ def inspect_self(indent="", variant="")
44
+ [
45
+ indent + "Node#{variant} interval=#{@interval}",
46
+ name ? "name=#{name.text_value.strip.inspect}" : nil,
47
+ mark ? "mark=#{mark.text_value.strip.inspect}" : nil,
48
+ amount ? "amount=#{amount.text_value.strip.inspect}" : nil,
49
+ auto_close ? "auto_close" : nil
50
+ ].compact.join(", ")
51
+ end
52
+
53
+ def inspect_children(indent="")
54
+ [
55
+ *contains.map {|child| "\n" + child.inspect(indent + " ") },
56
+ *notes.map {|note| "\n" + note.inspect(indent + " ", "(note)") }
57
+ ].join("")
58
+ end
59
+ end
60
+ end
@@ -0,0 +1,24 @@
1
+ require_relative '../cleaner'
2
+ require_relative 'scanner'
3
+ require_relative 'transform/amount'
4
+
5
+ module FoodIngredientParser::Loose
6
+ class Parser
7
+
8
+ # Create a new loose food ingredient parser.
9
+ # @return [FoodIngredientParser::Loose::Parser]
10
+ def initialize
11
+ end
12
+
13
+ # Parse food ingredient list text into a structured representation.
14
+ #
15
+ # @option clean [Boolean] pass +false+ to disable correcting frequently occurring issues
16
+ # @return [FoodIngredientParser::Loose::Node] structured representation of food ingredients
17
+ def parse(s, clean: true, **options)
18
+ s = FoodIngredientParser::Cleaner.clean(s) if clean
19
+ n = Scanner.new(s).scan
20
+ n = Transform::Amount.transform!(n) if n
21
+ n
22
+ end
23
+ end
24
+ end
@@ -0,0 +1,191 @@
1
+ require_relative 'node'
2
+
3
+ module FoodIngredientParser::Loose
4
+ class Scanner
5
+
6
+ SEP_CHARS = "|;,.".freeze
7
+ MARK_CHARS = "¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº⁽⁾†‡•°#^*".freeze
8
+ PREFIX_RE = /\A\s*(ingredients|contains|ingred[iï][eë]nt(en)?(declaratie)?|bevat|dit zit er\s?in|samenstelling|zutaten)\s*[:;.]\s*/i.freeze
9
+ NOTE_RE = /\A\b(dit product kan\b|kan sporen\b.*?\bbevatten\b|voor allergenen\b|allergenen\b|E\s*=|gemaakt in\b|geproduceerd in\b|bevat mogelijk\b|kijk voor meer\b|allergie-info|in de fabriek\b|in dit bedrijf\b)/i.freeze
10
+
11
+ def initialize(s, index: 0)
12
+ @s = s # input string
13
+ @i = index # current index in string
14
+ @cur = nil # current node we're populating
15
+ @ancestors = [Node.new(@s, @i)] # nesting hierarchy
16
+ @iterator = :beginning # scan_iteration_<iterator> to use for parsing
17
+ @dest = :contains # append current node to this attribute on parent
18
+ end
19
+
20
+ def scan
21
+ loop do
22
+ method(:"scan_iteration_#{@iterator}").call
23
+ end
24
+
25
+ close_all_ancestors
26
+ @ancestors.first.ends(@i-1)
27
+ @ancestors.first
28
+ end
29
+
30
+ private
31
+
32
+ def loop
33
+ while @i < @s.length
34
+ @i += 1 if yield != false
35
+ end
36
+ end
37
+
38
+ def scan_iteration_beginning
39
+ # skip over some common prefixes
40
+ m = @s[@i .. -1].match(PREFIX_RE)
41
+ @i += m.offset(0).last if m
42
+ # now continue with the standard parsing
43
+ @iterator = :standard
44
+ false
45
+ end
46
+
47
+ def scan_iteration_standard
48
+ if "([".include?(c) # open nesting
49
+ open_parent
50
+ elsif ")]".include?(c) # close nesting
51
+ add_child
52
+ close_parent
53
+ elsif is_notes_start? # usually a dot marks the start of notes
54
+ close_all_ancestors
55
+ @iterator = :notes
56
+ @dest = :notes
57
+ elsif is_sep? # separator
58
+ add_child
59
+ elsif ":".include?(c) # another open nesting
60
+ add_child
61
+ open_parent(auto_close: true)
62
+ @iterator = :colon
63
+ elsif is_mark? && !cur.mark # mark after ingredient
64
+ name_until_here
65
+ len = mark_len
66
+ cur.mark = Node.new(@s, @i .. @i+len-1)
67
+ @i += len - 1
68
+ else
69
+ cur # reference to record starting position
70
+ end
71
+ end
72
+
73
+ def scan_iteration_colon
74
+ if "/".include?(c) # slash separator in colon nesting only
75
+ add_child
76
+ elsif is_sep? # regular separator indicates end of colon nesting
77
+ add_child
78
+ close_parent
79
+ # revert to standard parsing from here on
80
+ @iterator = :standard
81
+ scan_iteration_standard
82
+ elsif "([]):".include?(c) # continue with deeper nesting level
83
+ # revert to standard parsing from here on
84
+ @iterator = :standard
85
+ scan_iteration_standard
86
+ else
87
+ # normal handling for this character
88
+ scan_iteration_standard
89
+ end
90
+ end
91
+
92
+ def scan_iteration_notes
93
+ if is_sep?(chars: ".") # dot means new note
94
+ add_child
95
+ else
96
+ cur
97
+ end
98
+ end
99
+
100
+ def c
101
+ @s[@i]
102
+ end
103
+
104
+ def parent
105
+ @ancestors.last
106
+ end
107
+
108
+ def cur
109
+ @cur ||= Node.new(@s, @i)
110
+ end
111
+
112
+ def is_mark?
113
+ mark_len > 0 && @s[@i..@i+1] !~ /\A°[CF]/
114
+ end
115
+
116
+ def is_sep?(chars: SEP_CHARS)
117
+ chars.include?(c) && @s[@i-1..@i+1] !~ /\A\d.\d\z/
118
+ end
119
+
120
+ def mark_len
121
+ i = @i
122
+ while @s[i] && MARK_CHARS.include?(@s[i])
123
+ i += 1
124
+ end
125
+ i - @i
126
+ end
127
+
128
+ def is_notes_start?
129
+ # @todo use more heuristics: don't assume dot is notes when separator is a dot, and only toplevel?
130
+ if ( is_mark? && @s[@i+mark_len..-1] =~ /\A\s*=/ ) || # "* = Biologisch"
131
+ ( is_mark? && @s[@i-2..@i-1] =~ /\A\s\s/ ) || # " **Biologisch"
132
+ ( @s[@i..-1] =~ NOTE_RE ) # "E=", "Kan sporen van", ...
133
+ @i -= 1 # we want to include the mark in the note
134
+ true
135
+ # End of sentence
136
+ elsif dot_is_not_sep? && is_sep?(chars: ".")
137
+ true
138
+ else
139
+ false
140
+ end
141
+ end
142
+
143
+ def add_child
144
+ cur.ends(@i-1)
145
+ cur.name ||= Node.new(@s, cur.interval)
146
+ parent.send(@dest) << cur
147
+ @cur = nil
148
+ end
149
+
150
+ def open_parent(**options)
151
+ name_until_here
152
+ @ancestors << cur
153
+ @cur = Node.new(@s, @i + 1, **options)
154
+ end
155
+
156
+ def close_parent
157
+ return unless @ancestors.count > 1
158
+ @cur = @ancestors.pop
159
+ while @cur.auto_close
160
+ add_child
161
+ @cur = @ancestors.pop
162
+ end
163
+ end
164
+
165
+ def close_all_ancestors
166
+ while @ancestors.count > 1
167
+ add_child
168
+ close_parent
169
+ end
170
+ add_child
171
+ end
172
+
173
+ def name_until_here
174
+ cur.name ||= Node.new(@s, cur.interval.first .. @i-1)
175
+ end
176
+
177
+ def dot_is_not_sep?
178
+ # if separator is dot ".", don't use it for note detection
179
+ if @dot_is_not_sep.nil?
180
+ @dot_is_not_sep = begin
181
+ # @todo if another separator is found more often, dot is not a separator
182
+ num_words = @s.split(/\s+/).count
183
+ num_dots = @s.count(".")
184
+ # heuristic: 1/4+ of the words has a dot, with at least five words
185
+ num_words < 5 || 4 * num_dots < num_words
186
+ end
187
+ end
188
+ @dot_is_not_sep
189
+ end
190
+ end
191
+ end
@@ -0,0 +1,70 @@
1
+ require 'treetop'
2
+ require_relative '../../strict/nodes'
3
+ Treetop.load File.dirname(__FILE__) + '/../../strict/grammar/common'
4
+ Treetop.load File.dirname(__FILE__) + '/../../strict/grammar/amount'
5
+ Treetop.load File.dirname(__FILE__) + '/amount_from_name'
6
+
7
+ require_relative '../node'
8
+
9
+ module FoodIngredientParser::Loose
10
+ module Transform
11
+ # Transforms node tree to extract amount into its own attribute.
12
+ class Amount
13
+ def self.transform!(node)
14
+ new(node).transform!
15
+ end
16
+
17
+ def initialize(node)
18
+ @node = node
19
+ @parser = FoodIngredientParser::Loose::Transform::AmountFromNameParser.new
20
+ end
21
+
22
+ def transform!
23
+ transform_name
24
+ transform_contains
25
+ @node
26
+ end
27
+
28
+ private
29
+
30
+ # Extract amount from name, if any.
31
+ def transform_name(node = @node)
32
+ if !node.amount && parsed = parse_amount(node.name&.text_value)
33
+ offset = node.name.interval.first
34
+
35
+ amount = parsed.amount.amount
36
+ node.amount = Node.new(node.input, offset + amount.interval.first .. offset + amount.interval.last - 1)
37
+
38
+ name = parsed.respond_to?(:name) && parsed.name
39
+ if name && name.interval.count > 0
40
+ node.name = Node.new(node.input, offset + name.interval.first .. offset + name.interval.last - 1)
41
+ else
42
+ node.name = nil
43
+ end
44
+ end
45
+
46
+ # recursively transform contained nodes
47
+ node.contains&.each(&method(:transform_name))
48
+ end
49
+
50
+ # If first or last child is an amount, it's this node's amount.
51
+ # Assumes all names already have extracted their amounts with {{#transform_name}}.
52
+ def transform_contains(node = @node)
53
+ if !node.amount && node.contains.any?
54
+ if node.contains.first.name.nil? && node.contains.first.amount
55
+ node.amount = node.contains.shift.amount
56
+ elsif node.contains.count > 1 && node.contains.last.name.nil? && node.contains.last.amount
57
+ node.amount = node.contains.pop.amount
58
+ end
59
+ end
60
+
61
+ # recursively transform contained nodes
62
+ node.contains.each(&method(:transform_contains))
63
+ end
64
+
65
+ def parse_amount(s)
66
+ @parser.parse(s) if s && s.strip != ''
67
+ end
68
+ end
69
+ end
70
+ end
@@ -0,0 +1,13 @@
1
+ module FoodIngredientParser::Loose::Transform
2
+ grammar AmountFromName
3
+ include FoodIngredientParser::Strict::Grammar::Common
4
+ include FoodIngredientParser::Strict::Grammar::Amount
5
+
6
+ rule amount_from_name
7
+ # just amount, amount in front or at the end
8
+ ws* amount:amount ws+ name:(.*) /
9
+ ws* amount:amount ws* /
10
+ ws* name:( !amount word ( ws+ !amount word )* )+ ws* amount:amount ws*
11
+ end
12
+ end
13
+ end
@@ -1,4 +1,4 @@
1
- module FoodIngredientParser::Grammar
1
+ module FoodIngredientParser::Strict::Grammar
2
2
  grammar Amount
3
3
  include Common
4
4
 
@@ -12,18 +12,19 @@ module FoodIngredientParser::Grammar
12
12
  rule simple_amount
13
13
  ( (
14
14
  'of which'i / 'at least'i / 'minimal'i / 'maximal'i / 'less than'i / 'more than'i /
15
- 'waarvan'i / 'ten minste'i / 'tenminste'i / 'minimaal'i / 'maximaal'i / 'minder dan'i / 'meer dan'i
15
+ 'waarvan'i / 'ten minste'i / 'tenminste'i / 'minimaal'i / 'maximaal'i / 'minder dan'i / 'meer dan'i /
16
+ 'min.'i / 'min'i / 'max.'i / 'max'i
16
17
  ) ws* )?
17
18
  [±∓~∼∽≂≃≈≲≤<>≥≳]? ws*
18
19
  simple_amount_quantity
19
20
  ( ws+ (
20
- 'minimum'i /
21
- 'minimaal'i / 'minimum'i
21
+ 'minimaal'i / 'minimum'i / 'van het uitlekgewicht'i / 'van het geheel'i /
22
+ 'min.'i / 'min'i / 'max.'i / 'max'i
22
23
  ) )?
23
24
  end
24
25
 
25
26
  rule simple_amount_quantity
26
- number ( ws* '-' ws* number )? ws* ( '%' / 'g'i / 'mg'i / 'gram'i / 'ml'i )
27
+ number ( ws* '-' ws* number )? ws* ( [%٪⁒%﹪] / ( ( 'procent' / 'percent' / 'gram'i / 'ml'i / 'mg'i / 'g'i ) !char ) )
27
28
  end
28
29
 
29
30
  end
@@ -1,4 +1,4 @@
1
- module FoodIngredientParser::Grammar
1
+ module FoodIngredientParser::Strict::Grammar
2
2
  grammar Common
3
3
 
4
4
  rule ws
@@ -1,4 +1,4 @@
1
- module FoodIngredientParser::Grammar
1
+ module FoodIngredientParser::Strict::Grammar
2
2
  grammar Ingredient
3
3
  include IngredientSimple
4
4
  include IngredientNested
@@ -1,4 +1,4 @@
1
- module FoodIngredientParser::Grammar
1
+ module FoodIngredientParser::Strict::Grammar
2
2
  grammar IngredientColoned
3
3
  include Common
4
4
  include Amount
@@ -1,4 +1,4 @@
1
- module FoodIngredientParser::Grammar
1
+ module FoodIngredientParser::Strict::Grammar
2
2
  grammar IngredientNested
3
3
  include Common
4
4
  include Amount
@@ -1,4 +1,4 @@
1
- module FoodIngredientParser::Grammar
1
+ module FoodIngredientParser::Strict::Grammar
2
2
  grammar IngredientSimple
3
3
  include Common
4
4
  include Amount
@@ -1,4 +1,4 @@
1
- module FoodIngredientParser::Grammar
1
+ module FoodIngredientParser::Strict::Grammar
2
2
  grammar List
3
3
  include Common
4
4
  include Ingredient
@@ -1,4 +1,4 @@
1
- module FoodIngredientParser::Grammar
1
+ module FoodIngredientParser::Strict::Grammar
2
2
  grammar ListColoned
3
3
  include Common
4
4
  include IngredientSimple
@@ -1,4 +1,4 @@
1
- module FoodIngredientParser::Grammar
1
+ module FoodIngredientParser::Strict::Grammar
2
2
  grammar ListNewlined
3
3
  include Common
4
4
  include IngredientSimple
@@ -1,4 +1,4 @@
1
- module FoodIngredientParser::Grammar
1
+ module FoodIngredientParser::Strict::Grammar
2
2
  grammar Root
3
3
  include Common
4
4
  include List
@@ -0,0 +1,74 @@
1
+ require 'treetop/runtime'
2
+ require_relative 'to_html'
3
+
4
+ # Needs to be in grammar namespace so Treetop can find the nodes.
5
+ module FoodIngredientParser::Strict
6
+ module Grammar
7
+
8
+ # Treetop syntax node with our additions, use this as parent for all our own nodes.
9
+ class SyntaxNode < Treetop::Runtime::SyntaxNode
10
+ private
11
+
12
+ def to_a_deep(n, cls)
13
+ if n.is_a?(cls)
14
+ [n]
15
+ elsif n.nonterminal?
16
+ n.elements.map {|m| to_a_deep(m, cls) }.flatten(1).compact
17
+ end
18
+ end
19
+ end
20
+
21
+ # Root object, contains everything else.
22
+ class RootNode < SyntaxNode
23
+ include FoodIngredientParser::Strict::ToHtml
24
+
25
+ def to_h
26
+ h = { contains: contains.to_a }
27
+ if notes && notes_ary = to_a_deep(notes, NoteNode)&.map(&:text_value)
28
+ h[:notes] = notes_ary if notes_ary.length > 0
29
+ end
30
+ h
31
+ end
32
+ end
33
+
34
+ # List of ingredients.
35
+ class ListNode < SyntaxNode
36
+ def to_a
37
+ to_a_deep(contains, IngredientNode).map(&:to_h)
38
+ end
39
+ end
40
+
41
+ # Ingredient
42
+ class IngredientNode < SyntaxNode
43
+ def to_h
44
+ h = {}
45
+ h.merge!(to_a_deep(ing, IngredientNode)&.first&.to_h || {}) if respond_to?(:ing)
46
+ h.merge!(to_a_deep(amount, AmountNode)&.first&.to_h || {}) if respond_to?(:amount)
47
+ h[:name] = name.text_value if respond_to?(:name)
48
+ h[:name] = pre.text_value + h[:name] if respond_to?(:pre)
49
+ h[:name] = h[:name] + post.text_value if respond_to?(:post)
50
+ h[:mark] = mark.text_value if respond_to?(:mark) && mark.text_value != ''
51
+ h
52
+ end
53
+ end
54
+
55
+ # Ingredient with containing ingredients.
56
+ class NestedIngredientNode < IngredientNode
57
+ def to_h
58
+ super.merge({ contains: to_a_deep(contains, IngredientNode).map(&:to_h) })
59
+ end
60
+ end
61
+
62
+ # Amount, specifying an ingredient.
63
+ class AmountNode < SyntaxNode
64
+ def to_h
65
+ { amount: amount.text_value }
66
+ end
67
+ end
68
+
69
+ # Note at the end of the ingredient list.
70
+ class NoteNode < SyntaxNode
71
+ end
72
+
73
+ end
74
+ end
@@ -1,6 +1,7 @@
1
1
  require_relative 'grammar'
2
+ require_relative '../cleaner'
2
3
 
3
- module FoodIngredientParser
4
+ module FoodIngredientParser::Strict
4
5
  class Parser
5
6
 
6
7
  # @!attribute [r] parser
@@ -20,22 +21,9 @@ module FoodIngredientParser
20
21
  # @return [FoodIngredientParser::Grammar::RootNode] structured representation of food ingredients
21
22
  # @note Unrecognized options are passed to Treetop, but this is not guaranteed to remain so forever.
22
23
  def parse(s, clean: true, **options)
23
- s = clean(s) if clean
24
+ s = FoodIngredientParser::Cleaner.clean(s) if clean
24
25
  @parser.parse(s, **options)
25
26
  end
26
27
 
27
- private
28
-
29
- def clean(s)
30
- s.gsub!("\u00ad", "") # strip soft hyphen
31
- s.gsub!("\u0092", "'") # windows-1252 apostrophe - https://stackoverflow.com/a/15564279/2866660
32
- s.gsub!("aÄs", "aïs") # encoding issue for maïs
33
- s.gsub!("ï", "ï") # encoding issue
34
- s.gsub!("ë", "ë") # encoding issue
35
- s.gsub!(/\A\s*"(.*)"\s*\z/, '\1') # enclosing double quotation marks
36
- s.gsub!(/\A\s*'(.*)'\s*\z/, '\1') # enclosing single quotation marks
37
- s
38
- end
39
-
40
28
  end
41
29
  end
@@ -0,0 +1,54 @@
1
+ require 'cgi'
2
+
3
+ # Adds HTML output functionality to a Treetop Node.
4
+ #
5
+ # The node needs to provide a {#to_h} method (for {#to_html_h}).
6
+ #
7
+ module FoodIngredientParser::Strict
8
+ module ToHtml
9
+
10
+ # Markup original ingredients list text in HTML.
11
+ #
12
+ # The input text is returned as HTML, augmented with CSS classes
13
+ # on +span+s for +name+, +amount+, +mark+ and +note+.
14
+ #
15
+ # @return [String] HTML representation of ingredient list.
16
+ def to_html
17
+ node_to_html(self)
18
+ end
19
+
20
+ private
21
+
22
+ def node_to_html(node, cls=nil, depth=0)
23
+ el_cls = {} # map of node instances to class names for contained elements
24
+ terminal = node.terminal? # whether to look at children elements or not
25
+
26
+ if node.is_a?(FoodIngredientParser::Strict::Grammar::AmountNode)
27
+ cls ||= "amount"
28
+ elsif node.is_a?(FoodIngredientParser::Strict::Grammar::NoteNode)
29
+ cls ||= "note"
30
+ terminal = true # NoteNodes may contain other NoteNodes, we want it flat.
31
+ elsif node.is_a?(FoodIngredientParser::Strict::Grammar::IngredientNode)
32
+ el_cls[node.name] = "name" if node.respond_to?(:name)
33
+ el_cls[node.mark] = "mark" if node.respond_to?(:mark)
34
+ if node.respond_to?(:contains)
35
+ el_cls[node.contains] = "contains depth#{depth}"
36
+ depth += 1
37
+ end
38
+ elsif node.is_a?(FoodIngredientParser::Strict::Grammar::RootNode)
39
+ if node.respond_to?(:contains)
40
+ el_cls[node.contains] = "depth#{depth}"
41
+ depth += 1
42
+ end
43
+ end
44
+
45
+ val = if terminal
46
+ CGI.escapeHTML(node.text_value)
47
+ else
48
+ node.elements.map {|el| node_to_html(el, el_cls[el], depth) }.join("")
49
+ end
50
+
51
+ cls ? "<span class='#{cls}'>#{val}</span>" : val
52
+ end
53
+ end
54
+ end
@@ -1,4 +1,4 @@
1
1
  module FoodIngredientParser
2
- VERSION = '1.0.0.pre.5'
3
- VERSION_DATE = '2018-09-07'
2
+ VERSION = '1.0.0.pre.6'
3
+ VERSION_DATE = '2018-09-17'
4
4
  end
@@ -1,2 +1,3 @@
1
1
  require_relative 'food_ingredient_parser/version'
2
- require_relative 'food_ingredient_parser/parser'
2
+ require_relative 'food_ingredient_parser/strict/parser'
3
+ require_relative 'food_ingredient_parser/loose/parser'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: food_ingredient_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0.pre.5
4
+ version: 1.0.0.pre.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - wvengen
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-09-07 00:00:00.000000000 Z
11
+ date: 2018-09-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: treetop
@@ -42,20 +42,26 @@ files:
42
42
  - bin/food_ingredient_parser
43
43
  - food_ingredient_parser.gemspec
44
44
  - lib/food_ingredient_parser.rb
45
- - lib/food_ingredient_parser/grammar.rb
46
- - lib/food_ingredient_parser/grammar/amount.treetop
47
- - lib/food_ingredient_parser/grammar/common.treetop
48
- - lib/food_ingredient_parser/grammar/ingredient.treetop
49
- - lib/food_ingredient_parser/grammar/ingredient_coloned.treetop
50
- - lib/food_ingredient_parser/grammar/ingredient_nested.treetop
51
- - lib/food_ingredient_parser/grammar/ingredient_simple.treetop
52
- - lib/food_ingredient_parser/grammar/list.treetop
53
- - lib/food_ingredient_parser/grammar/list_coloned.treetop
54
- - lib/food_ingredient_parser/grammar/list_newlined.treetop
55
- - lib/food_ingredient_parser/grammar/root.treetop
56
- - lib/food_ingredient_parser/nodes.rb
57
- - lib/food_ingredient_parser/parser.rb
58
- - lib/food_ingredient_parser/to_html.rb
45
+ - lib/food_ingredient_parser/cleaner.rb
46
+ - lib/food_ingredient_parser/loose/node.rb
47
+ - lib/food_ingredient_parser/loose/parser.rb
48
+ - lib/food_ingredient_parser/loose/scanner.rb
49
+ - lib/food_ingredient_parser/loose/transform/amount.rb
50
+ - lib/food_ingredient_parser/loose/transform/amount_from_name.treetop
51
+ - lib/food_ingredient_parser/strict/grammar.rb
52
+ - lib/food_ingredient_parser/strict/grammar/amount.treetop
53
+ - lib/food_ingredient_parser/strict/grammar/common.treetop
54
+ - lib/food_ingredient_parser/strict/grammar/ingredient.treetop
55
+ - lib/food_ingredient_parser/strict/grammar/ingredient_coloned.treetop
56
+ - lib/food_ingredient_parser/strict/grammar/ingredient_nested.treetop
57
+ - lib/food_ingredient_parser/strict/grammar/ingredient_simple.treetop
58
+ - lib/food_ingredient_parser/strict/grammar/list.treetop
59
+ - lib/food_ingredient_parser/strict/grammar/list_coloned.treetop
60
+ - lib/food_ingredient_parser/strict/grammar/list_newlined.treetop
61
+ - lib/food_ingredient_parser/strict/grammar/root.treetop
62
+ - lib/food_ingredient_parser/strict/nodes.rb
63
+ - lib/food_ingredient_parser/strict/parser.rb
64
+ - lib/food_ingredient_parser/strict/to_html.rb
59
65
  - lib/food_ingredient_parser/version.rb
60
66
  homepage: https://github.com/q-m/food-ingredient-parser-ruby
61
67
  licenses:
@@ -1,72 +0,0 @@
1
- require 'treetop/runtime'
2
- require_relative 'to_html'
3
-
4
- # Needs to be in grammar namespace so Treetop can find the nodes.
5
- module FoodIngredientParser::Grammar
6
-
7
- # Treetop syntax node with our additions, use this as parent for all our own nodes.
8
- class SyntaxNode < Treetop::Runtime::SyntaxNode
9
- private
10
-
11
- def to_a_deep(n, cls)
12
- if n.is_a?(cls)
13
- [n]
14
- elsif n.nonterminal?
15
- n.elements.map {|m| to_a_deep(m, cls) }.flatten(1).compact
16
- end
17
- end
18
- end
19
-
20
- # Root object, contains everything else.
21
- class RootNode < SyntaxNode
22
- include FoodIngredientParser::ToHtml
23
-
24
- def to_h
25
- h = { contains: contains.to_a }
26
- if notes && notes_ary = to_a_deep(notes, NoteNode)&.map(&:text_value)
27
- h[:notes] = notes_ary if notes_ary.length > 0
28
- end
29
- h
30
- end
31
- end
32
-
33
- # List of ingredients.
34
- class ListNode < SyntaxNode
35
- def to_a
36
- to_a_deep(contains, IngredientNode).map(&:to_h)
37
- end
38
- end
39
-
40
- # Ingredient
41
- class IngredientNode < SyntaxNode
42
- def to_h
43
- h = {}
44
- h.merge!(to_a_deep(ing, IngredientNode)&.first&.to_h || {}) if respond_to?(:ing)
45
- h.merge!(to_a_deep(amount, AmountNode)&.first&.to_h || {}) if respond_to?(:amount)
46
- h[:name] = name.text_value if respond_to?(:name)
47
- h[:name] = pre.text_value + h[:name] if respond_to?(:pre)
48
- h[:name] = h[:name] + post.text_value if respond_to?(:post)
49
- h[:mark] = mark.text_value if respond_to?(:mark) && mark.text_value != ''
50
- h
51
- end
52
- end
53
-
54
- # Ingredient with containing ingredients.
55
- class NestedIngredientNode < IngredientNode
56
- def to_h
57
- super.merge({ contains: to_a_deep(contains, IngredientNode).map(&:to_h) })
58
- end
59
- end
60
-
61
- # Amount, specifying an ingredient.
62
- class AmountNode < SyntaxNode
63
- def to_h
64
- { amount: amount.text_value }
65
- end
66
- end
67
-
68
- # Note at the end of the ingredient list.
69
- class NoteNode < SyntaxNode
70
- end
71
-
72
- end
@@ -1,52 +0,0 @@
1
- require 'cgi'
2
-
3
- # Adds HTML output functionality to a Treetop Node.
4
- #
5
- # The node needs to provide a {#to_h} method (for {#to_html_h}).
6
- #
7
- module FoodIngredientParser::ToHtml
8
-
9
- # Markup original ingredients list text in HTML.
10
- #
11
- # The input text is returned as HTML, augmented with CSS classes
12
- # on +span+s for +name+, +amount+, +mark+ and +note+.
13
- #
14
- # @return [String] HTML representation of ingredient list.
15
- def to_html
16
- node_to_html(self)
17
- end
18
-
19
- private
20
-
21
- def node_to_html(node, cls=nil, depth=0)
22
- el_cls = {} # map of node instances to class names for contained elements
23
- terminal = node.terminal? # whether to look at children elements or not
24
-
25
- if node.is_a?(FoodIngredientParser::Grammar::AmountNode)
26
- cls ||= "amount"
27
- elsif node.is_a?(FoodIngredientParser::Grammar::NoteNode)
28
- cls ||= "note"
29
- terminal = true # NoteNodes may contain other NoteNodes, we want it flat.
30
- elsif node.is_a?(FoodIngredientParser::Grammar::IngredientNode)
31
- el_cls[node.name] = "name" if node.respond_to?(:name)
32
- el_cls[node.mark] = "mark" if node.respond_to?(:mark)
33
- if node.respond_to?(:contains)
34
- el_cls[node.contains] = "contains depth#{depth}"
35
- depth += 1
36
- end
37
- elsif node.is_a?(FoodIngredientParser::Grammar::RootNode)
38
- if node.respond_to?(:contains)
39
- el_cls[node.contains] = "depth#{depth}"
40
- depth += 1
41
- end
42
- end
43
-
44
- val = if terminal
45
- CGI.escapeHTML(node.text_value)
46
- else
47
- node.elements.map {|el| node_to_html(el, el_cls[el], depth) }.join("")
48
- end
49
-
50
- cls ? "<span class='#{cls}'>#{val}</span>" : val
51
- end
52
- end