food_fish_parser 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a92347877837b339f13c2140d2955f41410ea7a6258b34a51f1daf35c4009715
4
- data.tar.gz: 9d554028e69f5925e747054cd6c13f3742014ec37ec0621ae1b6008f72a0a8fc
3
+ metadata.gz: 83e6a16aa71bcfb8c10737828059764fb46bb855bedef1a682695e3aed9a4b31
4
+ data.tar.gz: 866d70e749ca95dcd449ac64b2617e6076fac00e2361c5f252272d27a677fa14
5
5
  SHA512:
6
- metadata.gz: c93ac59e5393093803ad638ab8992deb0a2af35ed7df7f9e3c0d5666d9477d66660f55bce085cea38b9d3a75c7fd6c17f769af56999b15e745cdb55f4540e6ef
7
- data.tar.gz: bf20e42335d25ab91068d2dc8bdd6e8db5a0fbc13595297c97587e0c3a8c4daf3b4668eee32742669cfda32c508e8c2d99bf1fb3dc242c705d1efd2bf63e7550
6
+ metadata.gz: c4dfdf7a85fadc219b423e7d8a95b7d3d0fd4f699657dbcbd4bf43e48b1f89cd4e5975fc48f25c145d56182483a5914fb148475dc50c9d2359f4e3540f7b2ec7
7
+ data.tar.gz: 65f0b596fa65870a130bbfdd4663086fdd6c46c1d9349a2cb3b3561c43b2d0bd541df924af7df976cb946f98c79e716129c005f671a191766a80075d8aae07f1
data/README.md CHANGED
@@ -31,7 +31,9 @@ EOT
31
31
  parser = FoodFishParser::Parser.new
32
32
  puts parser.parse(s).to_a.inspect
33
33
  ```
34
+
34
35
  Results in a list of detected fishes
36
+
35
37
  ```ruby
36
38
  [
37
39
  {
@@ -72,12 +74,98 @@ Results in a list of detected fishes
72
74
  ]
73
75
  ```
74
76
 
77
+ When you have a piece of text and don't know where (or if) any fish details are
78
+ present, you can use the `anywhere` option.
79
+
80
+ ```ruby
81
+ require 'food_fish_parser'
82
+
83
+ parser = FoodFishParser::Parser.new
84
+ s = "tomaat, vis (zalm (salmo salar) gevangen in Noorwegen), zout"
85
+ puts parser.parse(s, anywhere: true).to_a.inspect
86
+ ```
87
+
88
+ This will find as many occurrences as possible. It is assumed that all fish details
89
+ in the text have the same amount of information (so fish name plus catch or aquaculture
90
+ information, or only fish names, or only catch or aquaculture information).
91
+ While the parser would normally return nothing, with `anywhere` it returns:
92
+
93
+ ```ruby
94
+ [
95
+ {
96
+ :names => [{ :common=>"zalm", :latin=>"salmo salar" }],
97
+ :catch_areas => [{ :text=>"Noorwegen", :fao_codes=>[] }],
98
+ :catch_methods => [],
99
+ :aquaculture_areas => [],
100
+ :aquaculture_methods => []
101
+ }
102
+ ]
103
+ ```
104
+
105
+ Please note that the `anywhere` option can make the parser much slower.
106
+
107
+
108
+ ## Test tool
109
+
110
+ The executable `food_fish_parser` is available after installing the gem. If you're
111
+ running from the source tree, use `bin/food_fish_parser` instead.
112
+
113
+ ```
114
+ $ food_fish_parser -h
115
+ Usage: bin/food_fish_parser [options] --file|-f <filename>
116
+ bin/food_fish_parser [options] --string|-s <text>
117
+
118
+ -f, --file FILE Parse all lines of the file as fish detail text.
119
+ -s, --string TEXT Parse specified fish detail text.
120
+ -q, --[no-]quiet Only show summary.
121
+ -p, --parsed Only show lines that were successfully parsed.
122
+ -n, --noresult Only show lines that had no result.
123
+ -a, --[no-]anywhere Search for fish details anywhere in the text.
124
+ -e, --[no-]escape Escape newlines
125
+ -c, --[no-]color Use color
126
+ -v, --[no-]verbose Show more data (parsed tree).
127
+ --version Show program version.
128
+ -h, --help Show this help
129
+
130
+ $ food_fish_parser -v -s "salmo salar"
131
+ "salmo salar"
132
+ SyntaxNode+Root6+RootNode+SyntaxNodeAdditions offset=0, "salmo salar" (to_a,to_a_deep):
133
+ SyntaxNode+Root3 offset=0, "salmo salar" (fish_only_names):
134
+ SyntaxNode+FishNode+SyntaxNodeAdditions+FishNameList1 offset=0, "salmo salar" (to_h,to_a_deep,fish_name):
135
+ SyntaxNode+FishNameNode+SyntaxNodeAdditions+FishNameLatin1+FishNameLatinNode offset=0, "salmo salar" (to_h,to_a_deep,fish_name_latin_first):
136
+ SyntaxNode offset=0, "salmo"
137
+ SyntaxNode+FishNameLatin0 offset=5, " salar" (fish_name_latin_second):
138
+ SyntaxNode offset=5, " ":
139
+ SyntaxNode offset=5, " "
140
+ SyntaxNode offset=6, "salar"
141
+ SyntaxNode offset=11, ""
142
+ SyntaxNode offset=11, ""
143
+ SyntaxNode offset=11, ""
144
+ SyntaxNode offset=11, ""
145
+ SyntaxNode offset=11, ""
146
+ [
147
+ {
148
+ :names=>[{:common=>nil, :latin=>"salmo salar"}],
149
+ :catch_areas=>[],
150
+ :catch_methods=>[],
151
+ :aquaculture_areas=>[],
152
+ :aquaculture_methods=>[]
153
+ }
154
+ ]
155
+
156
+ $ food_fish_parser -q -f data/test-cases
157
+ parsed 51 (100.0%), no result 0 (0.0%)
158
+ ```
159
+
160
+ If you want to use the output in (shell)scripts, the options `-e -c` may be quite useful.
161
+
75
162
 
76
163
  ## Test data
77
164
 
78
- [`data/fish-ingredient-samples-qm-nl`](data/fish-ingredient-samples-qm-nl) contains about 2k
165
+ [`data/fish-ingredient-samples-qm-nl`](data/fish-ingredient-samples-qm-nl) contains about 2k
79
166
  real-world ingredient lists with fish found on the Dutch market. Each line contains one ingredient
80
- list (newlines are encoded as `\n`, empty lines and those starting with `#` are ignored).
167
+ list (newlines are encoded as `\n`, empty lines and those starting with `#` are ignored). Of those,
168
+ something is returned for 99.8% of them (with the `anywhere` option), but quality varies greatly.
81
169
 
82
170
 
83
171
  ## Species
data/bin/food_fish_parser CHANGED
@@ -31,8 +31,8 @@ def colorize(color, s)
31
31
  end
32
32
  end
33
33
 
34
- def parse_single(s, parsed=nil, parser:, verbosity: 1, print: nil, escape: false, color: false)
35
- parsed ||= parser.parse(s)
34
+ def parse_single(s, parsed=nil, parser:, verbosity: 1, print: nil, escape: false, color: false, anywhere: false)
35
+ parsed ||= parser.parse(s, anywhere: anywhere)
36
36
 
37
37
  return unless print.nil? || (parsed && print == :parsed) || (!parsed && print == :noresult)
38
38
 
@@ -48,14 +48,14 @@ def parse_single(s, parsed=nil, parser:, verbosity: 1, print: nil, escape: false
48
48
  end
49
49
  end
50
50
 
51
- def parse_file(path, parser:, verbosity: 1, print: nil, escape: false, color: false)
51
+ def parse_file(path, parser:, verbosity: 1, print: nil, escape: false, color: false, anywhere: false)
52
52
  count_parsed = count_noresult = 0
53
53
  File.foreach(path) do |line|
54
54
  next if line =~ /^#/ # comment
55
55
  next if line =~ /^\s*$/ # empty line
56
56
 
57
57
  line = line.gsub('\\n', "\n").strip
58
- parsed = parser.parse(line)
58
+ parsed = parser.parse(line, anywhere: anywhere)
59
59
  count_parsed += 1 if parsed
60
60
  count_noresult += 1 unless parsed
61
61
 
@@ -72,6 +72,7 @@ verbosity = 1
72
72
  files = []
73
73
  strings = []
74
74
  print = nil
75
+ anywhere = false
75
76
  escape = false
76
77
  color = true
77
78
  OptionParser.new do |opts|
@@ -87,6 +88,7 @@ OptionParser.new do |opts|
87
88
  opts.on("-q", "--[no-]quiet", "Only show summary.") {|q| verbosity = q ? 0 : 1 }
88
89
  opts.on("-p", "--parsed", "Only show lines that were successfully parsed.") {|p| print = :parsed }
89
90
  opts.on("-n", "--noresult", "Only show lines that had no result.") {|p| print = :noresult }
91
+ opts.on("-a", "--[no-]anywhere", "Search for fish details anywhere in the text.") {|a| anywhere = !!a }
90
92
  opts.on("-e", "--[no-]escape", "Escape newlines") {|e| escape = !!e }
91
93
  opts.on("-c", "--[no-]color", "Use color") {|e| color = !!e }
92
94
  opts.on("-v", "--[no-]verbose", "Show more data (parsed tree).") {|v| verbosity = v ? 2 : 1 }
@@ -103,8 +105,8 @@ end.parse!
103
105
  if strings.any? || files.any?
104
106
  parser = FoodFishParser::Parser.new
105
107
  success = true
106
- strings.each {|s| success &= parse_single(s, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color) }
107
- files.each {|f| success &= parse_file(f, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color) }
108
+ strings.each {|s| success &= parse_single(s, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color, anywhere: anywhere) }
109
+ files.each {|f| success &= parse_file(f, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color, anywhere: anywhere) }
108
110
  success or exit(1)
109
111
  else
110
112
  STDERR.puts("Please specify one or more --file or --string arguments (see --help).")
@@ -12,7 +12,7 @@ Gem::Specification.new do |s|
12
12
  s.license = 'MIT'
13
13
  s.description = <<-EOD
14
14
  Food products that contain fish sometimes indicate details like fishing
15
- area, method or aquaculture country. This parser know about various ways
15
+ area, method or aquaculture country. This parser knows about various ways
16
16
  this is found on a product package, and returns a structured representation
17
17
  of the fish ingredient details.
18
18
  EOD
@@ -7,13 +7,18 @@ module FoodFishParser::Grammar
7
7
 
8
8
  rule aquac_area_indicator
9
9
  (
10
- 'uit'i / 'gekweekt in'i / 'gekweekt op'i /
10
+ 'gekweekt in'i / 'gekweekt op'i /
11
11
  'aquacultuurproduct uit'i / 'aquacultuur product uit'i
12
12
  )
13
13
  !char
14
14
  ( ws* ( ':' / '>' ) )?
15
15
  end
16
16
 
17
+ rule aquac_area_indicator_short
18
+ aquac_area_indicator /
19
+ ( 'uit'i ) !char ( ws* ( ':' / '>' ) )?
20
+ end
21
+
17
22
  rule aquac_area_content
18
23
  (
19
24
  ( area:( words ) ( ws* comma? ws* fao_area_list_enclosures )? ) /
@@ -3,7 +3,7 @@ module FoodFishParser::Grammar
3
3
 
4
4
  # whitespace
5
5
  rule ws
6
- [ \t]
6
+ [ \t\r\n]
7
7
  end
8
8
 
9
9
  rule char
@@ -4,12 +4,13 @@ module FoodFishParser::Grammar
4
4
  include Common
5
5
 
6
6
  rule fish_name_nl
7
+ ( 'verse'i ws+ )?
7
8
  ( fish_name_nl_area ws+ )? ( fish_name_nl_attr ws* )? fish_name_nl_name fish_name_nl_suffix?
8
9
  <FishNameCommonNode>
9
10
  end
10
11
 
11
12
  rule fish_name_nl_area
12
- 'pacifische'i / 'indische'i / 'groenlandse'i / 'atlantische'i / 'argentijnse'i / 'alaska'i
13
+ 'pacifische'i / 'noorse'i / 'indische'i / 'groenlandse'i / 'atlantische'i / 'argentijnse'i / 'alaska'i
13
14
  end
14
15
 
15
16
  rule fish_name_nl_attr
@@ -7,27 +7,61 @@ module FoodFishParser::Grammar
7
7
  include AquacArea
8
8
  include AquacMethod
9
9
 
10
+ # Regular root node that requires all text to match fish details.
11
+ #
12
+ # Note that here we prefer fish with catch or aquaculture info,
13
+ # then try fish names, and finally only catch or aquaculture info.
14
+ #
15
+ # The assumption is that all declared fish would have the same amount
16
+ # of information. Hence we first try for all info, then resort to partial
17
+ # info.
10
18
  rule root
11
- fishes:(
12
- ( fish ( ws* and_or ws* fish )+ ) /
13
- ( fish ( ws* ( '.' / comma ) ws* fish )+ ) /
14
- fish
19
+ (
20
+ ( fish_with_info ( ws* fish_sep ws* fish_with_info )* ) /
21
+ ( fish_only_names ( ws* fish_sep ws* fish_only_names )* ) /
22
+ ( fish_only_info ( ws* fish_sep ws* fish_only_info )* )
15
23
  )
16
- ( ws* '.' )?
24
+ ws* '.'? ws*
17
25
  <RootNode>
18
26
  end
19
27
 
20
- rule fish
28
+ # Alternate root node that allows fish details to be interspersed with other text.
29
+ #
30
+ # Note that this can be a much more expensive operation.
31
+ rule root_anywhere
32
+ (
33
+ ( !fish_with_info . )* ( fish_with_info ( ( !fish_with_info . )* fish_with_info )* ) ( !fish_with_info . )* /
34
+ ( !fish_only_names . )* ( fish_only_names ( ( !fish_only_names . )+ fish_only_names )* ) ( !fish_only_names . )* /
35
+ ( !fish_only_info . )* ( fish_only_info ( ( !fish_only_info . )+ fish_only_info )* ) ( !fish_only_info . )*
36
+ )
37
+ <RootNode>
38
+ end
39
+
40
+ # separator between fish declarations
41
+ rule fish_sep
42
+ and_or / '.' / comma
43
+ end
44
+
45
+ # fish with catch or aquaculture info
46
+ rule fish_with_info
21
47
  (
22
48
  ( fish_name_list ( ws* ( comma / ':' ) )? ws+ fish_catch_info ) /
23
- ( fish_name_list ( ws* ( comma / ':' ) )? ws+ fish_aquac_info ) /
24
- fish_name_list /
25
- fish_catch_info /
26
- fish_aquac_info
49
+ ( fish_name_list ( ws* ( comma / ':' ) )? ws+ fish_aquac_info )
27
50
  )
28
51
  <FishNode>
29
52
  end
30
53
 
54
+ # fish names only
55
+ rule fish_only_names
56
+ fish_name_list <FishNode>
57
+ end
58
+
59
+ # catch or aquaculture info only (no names)
60
+ rule fish_only_info
61
+ ( fish_catch_info / fish_aquac_info ) <FishNode>
62
+ end
63
+
64
+
31
65
  rule fish_catch_info
32
66
  (
33
67
  catch_method_indicator ws* catch_method_content
@@ -47,9 +81,8 @@ module FoodFishParser::Grammar
47
81
  ( ( ws* comma )? ws+ aquac_method_indicator ws* aquac_method_content )?
48
82
  ) / (
49
83
  aquac_method_indicator ws* aquac_method_content
50
- ( ( ws* comma )? ws+ aquac_area_indicator ws* aquac_area_content )?
84
+ ( ( ws* comma )? ws+ aquac_area_indicator_short ws* aquac_area_content )?
51
85
  )
52
86
  end
53
-
54
87
  end
55
88
  end
@@ -39,6 +39,7 @@ module FoodFishParser::Grammar
39
39
  'vangstmethode'i /
40
40
  'vangsmethode'i /
41
41
  'betrapt'i /
42
+ 'aquacultuurproduct'i /
42
43
  'gekweekt'i /
43
44
  'kweekmethode'i /
44
45
  'kweekmethoden'i /
@@ -19,7 +19,7 @@ module FoodFishParser
19
19
  module RootNode
20
20
  include SyntaxNodeAdditions
21
21
  def to_a
22
- to_a_deep(fishes, FishNode).map(&:to_h)
22
+ to_a_deep(self, FishNode).map(&:to_h)
23
23
  end
24
24
  end
25
25
 
@@ -16,9 +16,15 @@ module FoodFishParser
16
16
 
17
17
  # Parse food fish text into a structured representation.
18
18
  #
19
+ # @param s [String] text to parse
20
+ # @param anywhere [Bool] +false+ to assume the text is only fish details, +true+ to search for fish details in the text
19
21
  # @return [FoodFishParser::Grammar::RootNode] structured representation of fish details
20
22
  # @note Unrecognized options are passed to Treetop, but this is not guaranteed to remain so forever.
21
- def parse(s, **options)
23
+ def parse(s, anywhere: false, **options)
24
+ if anywhere
25
+ options = options.merge(root: :root_anywhere, consume_all_input: false)
26
+ end
27
+
22
28
  @parser.parse(s, **options)
23
29
  end
24
30
 
@@ -1,4 +1,4 @@
1
1
  module FoodFishParser
2
- VERSION = '0.1.0'
3
- VERSION_DATE = '2020-03-17'
2
+ VERSION = '0.2.0'
3
+ VERSION_DATE = '2020-03-18'
4
4
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: food_fish_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - wvengen
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-03-17 00:00:00.000000000 Z
11
+ date: 2020-03-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: treetop
@@ -26,7 +26,7 @@ dependencies:
26
26
  version: '1.6'
27
27
  description: |2
28
28
  Food products that contain fish sometimes indicate details like fishing
29
- area, method or aquaculture country. This parser know about various ways
29
+ area, method or aquaculture country. This parser knows about various ways
30
30
  this is found on a product package, and returns a structured representation
31
31
  of the fish ingredient details.
32
32
  email: