food_fish_parser 0.2.1 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +44 -2
- data/bin/food_fish_parser +11 -2
- data/lib/food_fish_parser/flat/fao_region.rb +35 -0
- data/lib/food_fish_parser/flat/fish_name.rb +34 -0
- data/lib/food_fish_parser/flat/fish_name_latin.rb +17 -0
- data/lib/food_fish_parser/flat/fish_name_nl.rb +20 -0
- data/lib/food_fish_parser/flat/kind.rb +41 -0
- data/lib/food_fish_parser/flat/parser.rb +52 -0
- data/lib/food_fish_parser/{grammar → strict/grammar}/aquac_area.treetop +1 -1
- data/lib/food_fish_parser/{grammar → strict/grammar}/aquac_method.treetop +1 -1
- data/lib/food_fish_parser/{grammar → strict/grammar}/catch_area.treetop +1 -1
- data/lib/food_fish_parser/{grammar → strict/grammar}/catch_method.treetop +1 -1
- data/lib/food_fish_parser/{grammar → strict/grammar}/common.treetop +1 -1
- data/lib/food_fish_parser/{grammar → strict/grammar}/fao_area.treetop +4 -3
- data/lib/food_fish_parser/strict/grammar/fish_name.treetop +30 -0
- data/lib/food_fish_parser/{grammar → strict/grammar}/fish_name_latin.treetop +4 -4
- data/lib/food_fish_parser/{grammar → strict/grammar}/fish_name_nl.treetop +2 -2
- data/lib/food_fish_parser/{grammar → strict/grammar}/root.treetop +24 -10
- data/lib/food_fish_parser/{grammar → strict/grammar}/words.treetop +1 -1
- data/lib/food_fish_parser/{grammar.rb → strict/grammar.rb} +0 -0
- data/lib/food_fish_parser/{nodes.rb → strict/nodes.rb} +1 -1
- data/lib/food_fish_parser/strict/parser.rb +34 -0
- data/lib/food_fish_parser/version.rb +2 -2
- data/lib/food_fish_parser.rb +2 -1
- metadata +22 -16
- data/lib/food_fish_parser/grammar/fish_name.treetop +0 -21
- data/lib/food_fish_parser/parser.rb +0 -32
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e92312ff6eec700a2c57b1c48cad77fb9710dd3086b2fa4ded31e352d0bef9c5
|
4
|
+
data.tar.gz: 404c5692f4a731e1614c9c4f8628ddf01f599192740e205c4e9f064411d6e18a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2de059ea4900f0aea03194559b275f1c7b307c1c06dc6a0b02a1bf17c21bccdc5a8fce98ad6fa178ad99181ad0ab36d5a609c82edd960280c46efbc2f1e71cb7
|
7
|
+
data.tar.gz: 7fdc6287c1285c7868d784d771b0f983d0c4fb975e3fc82eeff52a2f80768b12455caebad5d487b4b1b7816126f33ec6c51e04e91d498bb128e500aa34e183d2
|
data/README.md
CHANGED
@@ -18,6 +18,8 @@ gem install food_fish_parser
|
|
18
18
|
|
19
19
|
## Example
|
20
20
|
|
21
|
+
### Strict parser
|
22
|
+
|
21
23
|
```ruby
|
22
24
|
require 'food_fish_parser'
|
23
25
|
|
@@ -28,7 +30,7 @@ s = <<EOT.gsub(/\n/, '').strip
|
|
28
30
|
kabeljauw (gadus macrocephalus), vangstgebied stille oceaan fao 67, garnaal
|
29
31
|
(litopenaeus vannamei), gekweekt in ecuador, kweekmethode: vijver.
|
30
32
|
EOT
|
31
|
-
parser = FoodFishParser::Parser.new
|
33
|
+
parser = FoodFishParser::Strict::Parser.new
|
32
34
|
puts parser.parse(s).to_a.inspect
|
33
35
|
```
|
34
36
|
|
@@ -74,13 +76,15 @@ Results in a list of detected fishes
|
|
74
76
|
]
|
75
77
|
```
|
76
78
|
|
79
|
+
### Anywhere
|
80
|
+
|
77
81
|
When you have a piece of text and don't know where (or if) any fish details are
|
78
82
|
present, you can use the `anywhere` option.
|
79
83
|
|
80
84
|
```ruby
|
81
85
|
require 'food_fish_parser'
|
82
86
|
|
83
|
-
parser = FoodFishParser::Parser.new
|
87
|
+
parser = FoodFishParser::Strict::Parser.new
|
84
88
|
s = "tomaat, vis (zalm (salmo salar) gevangen in Noorwegen), zout"
|
85
89
|
puts parser.parse(s, anywhere: true).to_a.inspect
|
86
90
|
```
|
@@ -104,6 +108,43 @@ While the parser would normally return nothing, with `anywhere` it returns:
|
|
104
108
|
|
105
109
|
Please note that the `anywhere` option can make the parser much slower.
|
106
110
|
|
111
|
+
### Flat parser
|
112
|
+
|
113
|
+
While the strict parser can recognize the structure of multiple fishes, it is really
|
114
|
+
strict about what it expects. Many cases are not recognized, or are recognized only partially.
|
115
|
+
|
116
|
+
The flat parser does basic named entity recognition anywhere in the text. Any structure
|
117
|
+
is lost, so it always returns an array with one or zero items - but you get all the
|
118
|
+
FAO regions and fish names found.
|
119
|
+
|
120
|
+
```ruby
|
121
|
+
require 'food_fish_parser'
|
122
|
+
|
123
|
+
parser = FoodFishParser::Flat::Parser.new
|
124
|
+
s = "Foobar zalm (salmo salar) *&! gevangen pangasius spp FAO 61 ?or ?FAO 67 what more."
|
125
|
+
puts parser.parse(s).to_a.inspect
|
126
|
+
```
|
127
|
+
|
128
|
+
```
|
129
|
+
[
|
130
|
+
{
|
131
|
+
:names => [
|
132
|
+
{ :common=>"zalm", :latin=>"salmo salar" },
|
133
|
+
{ :common=>nil, :latin=>"pangasius spp" }
|
134
|
+
],
|
135
|
+
:catch_areas => [
|
136
|
+
{ :name=>nil, :fao_codes=>["61"] },
|
137
|
+
{ :name=>nil, :fao_codes=>["67"] }
|
138
|
+
],
|
139
|
+
:catch_methods => [],
|
140
|
+
:aquaculture_areas => [],
|
141
|
+
:aquaculture_methods => []
|
142
|
+
}
|
143
|
+
]
|
144
|
+
```
|
145
|
+
|
146
|
+
This might be expanded to more information at some point.
|
147
|
+
|
107
148
|
|
108
149
|
## Test tool
|
109
150
|
|
@@ -120,6 +161,7 @@ Usage: bin/food_fish_parser [options] --file|-f <filename>
|
|
120
161
|
-q, --[no-]quiet Only show summary.
|
121
162
|
-p, --parsed Only show lines that were successfully parsed.
|
122
163
|
-n, --noresult Only show lines that had no result.
|
164
|
+
-r, --parser PARSER Use specific parser (strict, flat).
|
123
165
|
-a, --[no-]anywhere Search for fish details anywhere in the text.
|
124
166
|
-e, --[no-]escape Escape newlines
|
125
167
|
-c, --[no-]color Use color
|
data/bin/food_fish_parser
CHANGED
@@ -72,9 +72,14 @@ verbosity = 1
|
|
72
72
|
files = []
|
73
73
|
strings = []
|
74
74
|
print = nil
|
75
|
+
parser_name = :strict
|
75
76
|
anywhere = false
|
76
77
|
escape = false
|
77
78
|
color = true
|
79
|
+
PARSERS = {
|
80
|
+
strict: FoodFishParser::Strict::Parser,
|
81
|
+
flat: FoodFishParser::Flat::Parser
|
82
|
+
}
|
78
83
|
OptionParser.new do |opts|
|
79
84
|
opts.banner = <<-EOF.gsub(/^ /, '')
|
80
85
|
Usage: #{$0} [options] --file|-f <filename>
|
@@ -88,7 +93,8 @@ OptionParser.new do |opts|
|
|
88
93
|
opts.on("-q", "--[no-]quiet", "Only show summary.") {|q| verbosity = q ? 0 : 1 }
|
89
94
|
opts.on("-p", "--parsed", "Only show lines that were successfully parsed.") {|p| print = :parsed }
|
90
95
|
opts.on("-n", "--noresult", "Only show lines that had no result.") {|p| print = :noresult }
|
91
|
-
opts.on("-
|
96
|
+
opts.on("-r", "--parser PARSER", "Use specific parser (#{PARSERS.keys.join(", ")}).") {|p| parser_name = p&.downcase&.to_sym }
|
97
|
+
opts.on("-a", "--[no-]anywhere", "Search for fish details anywhere in the text (only strict parser).") {|a| anywhere = !!a }
|
92
98
|
opts.on("-e", "--[no-]escape", "Escape newlines") {|e| escape = !!e }
|
93
99
|
opts.on("-c", "--[no-]color", "Use color") {|e| color = !!e }
|
94
100
|
opts.on("-v", "--[no-]verbose", "Show more data (parsed tree).") {|v| verbosity = v ? 2 : 1 }
|
@@ -103,7 +109,10 @@ OptionParser.new do |opts|
|
|
103
109
|
end.parse!
|
104
110
|
|
105
111
|
if strings.any? || files.any?
|
106
|
-
parser =
|
112
|
+
unless parser = PARSERS[parser_name]&.new
|
113
|
+
STDERR.puts("Please specify one of the known parsers: #{PARSERS.keys.join(", ")}.")
|
114
|
+
exit(1)
|
115
|
+
end
|
107
116
|
success = true
|
108
117
|
strings.each {|s| success &= parse_single(s, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color, anywhere: anywhere) }
|
109
118
|
files.each {|f| success &= parse_file(f, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color, anywhere: anywhere) }
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module FoodFishParser
|
2
|
+
module Flat
|
3
|
+
module FaoRegion
|
4
|
+
|
5
|
+
REGEX_CODE = / 0?\d{2} (?: (?: -0?\d{2} | (?: \/ | - | \s+ | ) [lIV]+ ) [a-d]? )? /ix
|
6
|
+
REGEX = /
|
7
|
+
\b FA[O0]
|
8
|
+
\s* (?: -? (?:zone|gebied) )? \s* (?:nummer|nr\.?|,)?
|
9
|
+
\s*
|
10
|
+
(
|
11
|
+
#{REGEX_CODE}
|
12
|
+
(?:
|
13
|
+
(?: ,?\s*en\s* #{REGEX_CODE} ) |
|
14
|
+
(?: \/ #{REGEX_CODE} )+ |
|
15
|
+
(?: (?: ,\s* #{REGEX_CODE} )+ (?: ,?\s*en\s* #{REGEX_CODE} )? )
|
16
|
+
)?
|
17
|
+
)
|
18
|
+
\b
|
19
|
+
/ix
|
20
|
+
|
21
|
+
def self.find_all_code_groups(text)
|
22
|
+
text
|
23
|
+
.scan(REGEX)
|
24
|
+
.flatten(1)
|
25
|
+
.map {|s| s.split(/\s*(?: en | ,\s*en | , | \/ )\s*/xi) }
|
26
|
+
.map {|a| a.map {|s| s.gsub(/\A0(\d{2})\z/, '\1') } } # normalize leading zeroes
|
27
|
+
end
|
28
|
+
|
29
|
+
def self.find_all(text)
|
30
|
+
find_all_code_groups(text)
|
31
|
+
.map {|a| { name: nil, fao_codes: a } }
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require_relative 'fish_name_latin'
|
2
|
+
require_relative 'fish_name_nl'
|
3
|
+
|
4
|
+
module FoodFishParser
|
5
|
+
module Flat
|
6
|
+
module FishName
|
7
|
+
REGEX_VIS = /\s*(?: \(\s*vis\s*\) | \b,?\s*vis\s*,?\s*\b )/ix
|
8
|
+
|
9
|
+
REGEX = /
|
10
|
+
(?:
|
11
|
+
#{FishNameNL::REGEX} (?: #{REGEX_VIS} )? \s* \( \s* #{FishNameLatin::REGEX} \s* \) |
|
12
|
+
#{FishNameNL::REGEX} \s*,?\s* \b #{FishNameLatin::REGEX} |
|
13
|
+
#{FishNameLatin::REGEX} |
|
14
|
+
#{FishNameNL::REGEX}
|
15
|
+
)
|
16
|
+
/ix
|
17
|
+
|
18
|
+
def self.find_all(text)
|
19
|
+
# Because scan doesn't support named captures, we have to use numbered capture groups.
|
20
|
+
# Make sure to keep all groups you don't want to reference below as non-capturing groups.
|
21
|
+
# Each name regex has a capture group (so as to avoid noise), so you don't see them here.
|
22
|
+
# The order of the captures corresponds to the order of the fish names in the regex above.
|
23
|
+
text.scan(REGEX).map do |m|
|
24
|
+
case
|
25
|
+
when m[0] && m[1] then { common: m[0], latin: m[1] }
|
26
|
+
when m[2] && m[3] then { common: m[2], latin: m[3] }
|
27
|
+
when m[4] then { common: nil, latin: m[4] }
|
28
|
+
when m[5] then { common: m[5], latin: nil }
|
29
|
+
end
|
30
|
+
end.compact
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# autogenerated by species-treetop-gen-latin.rb on 2020-03-19
|
2
|
+
module FoodFishParser
|
3
|
+
module Flat
|
4
|
+
module FishNameLatin
|
5
|
+
REGEX_FIRST = /zygochlamys|zeus|xiphopenaeus|xiphias|undaria|ulva|trichiurus|trachurus|todarodes|thunnus|theragra|stolephorus|sprattus|spirulina|sparus|solea|sepiella|sepia|sebastes|scomber|sardinella|sardina|salmo|saccharina|reinhardtius|psetta|procambarus|portunus|porphyra|pollachius|pleuronectes|pleoticus|placopecten|phymatolithon|perna|penaeus|penaeidae|pelvetia|pecten|patinopecten|parapenaeopsis|paralomis|paphia|pangasius|pandalus|palmaria|pagellus|ovalipes|ostrea|oreochromis|oncorhynchus|octopus|nephrops|nemipterus|nelumbo|mytilus|mulinia|micromesistius|metapenaeus|merluccius|merlangius|melanogrammus|macruronus|macrocystis|lophius|loligo|litopenaeus|lithodes|limanda|lethrinus|lepidotrigla|lepidopsetta|lates|laminaria|katwonus|katwomus|katsuwonus|katsuwomus|illex|homarus|himanthalia|haematococcus|gracilaria|gelidium|gadus|fucus|euthynnus|ensis|engraulis|dunaliella|dosidicus|dicentrarchus|crassostrea|crangon|clupea|clarias|chondrus|chlorella|cerastoderma|caulerpa|ascophyllum|anguilla|anadara|alle|alaria|acipenser/i
|
6
|
+
REGEX_SECND = /yessoensis|vulgaris|virens|vesiculosus|verrucosa|vannamei|undulata|umbilicalis|tenera|stylifera|sprattus|spp\.|spp|sombrus|solea|solar|scombrus|santolla|salina|salar|ringens|pyrifera|pyrenoidosa|punctatus|productus|pluvialis|platessa|platensis|piscatorius|pinnatifida|pilchardus|pelatis|pelanis|pelamis|pelagicus|patagonica|pangasius|palmata|pacificus|officinalis|ocellatus|nucifera|novaezelandiae|novaezealandiae|norvegicus|nodosum|niloticus|nerka|mykiss|mykis|myki\?s|murphyi|muelleri|morhue|morhua|monodon|monoceros|microptera|merluccius|merlangus|merguiensis|maximus|maxima|marinus|magellanicus|macrocephalus|limanda|lepturus|lentillifera|latissima|lactuca|labrax|kroyeri|kisutch|keta|jordani|japonicus|japonica|hyptophthalmus|hypophtalmus|hippoglossoides|hexodon|harengus|gueldenstaedtii|granulosa|gorbusche|gorbuscha|gladius|gigas|gibbosa|gariepinus|galloprovincialis|faber|esculenta|encrasicolus|elongata|edulis|edule|directus|digitata|crispus|crangon|clarkii|chilensis|chalcogramma|capensis|cannamei|canaliculus|canaliculata|calcareum|borealis|bogaraveo|bilineata|australis|aurata|argentinus|antiquata|anquilla|anguilla|anchoita|americanus|alle|albacares|alalunga|aeglefinus/i
|
7
|
+
|
8
|
+
REGEX = /
|
9
|
+
\b
|
10
|
+
(
|
11
|
+
#{REGEX_FIRST} (?: \s+ #{REGEX_SECND} )?
|
12
|
+
)
|
13
|
+
\b
|
14
|
+
/ix
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# autogenerated by species-treetop-gen-nl.rb on 2020-03-19
|
2
|
+
module FoodFishParser
|
3
|
+
module Flat
|
4
|
+
module FishNameNL
|
5
|
+
REGEX_AREA = /pacifische|noorse|indische|groenlandse|atlantische|argentijnse|alaska/i
|
6
|
+
REGEX_ATTR = /zwarte|zwart|witte|witpoot|wit|roze|rood|rode|rivier|pijl|kleine|klein|grote|groot|groene|groen|doorn|coho|chum|blauwe|blauw/i
|
7
|
+
REGEX_NAME = /zonnevis|zeewolf|zeesnoek|zeekreeft|zeeforel|zeebaars|zalm|wijting|weekdieren|weekdier|vintonijn|tonijn\ albacore|tonijn|tong|tilapia|thon\ albacore|tarbot|tapijtschelp|sprot|spie|snotolf|snoekbaars|snoek|skipjack\ tonijn|schol|schelvis|schelpen|schelp|schar|sardines|regenboogforel|raat|poon|pollock|pollak|pangasius|paling|oogtonijn|mul|mosselen|mossel|meerval|mantelschelp|makreel|lom|leng|kreeft|koolvis|kokkel|karper|kabeljauw|hondstong|hoki|heilbot|heek|hake|haai|ha|gruis|griet|geep|geelvintonijn|garnalen|garnaal|fint|coquilles|cocquilles|botervis|bot|beekridder|barracuda|baars|arkschelp|ansjovis|albacore\ tonijn/i
|
8
|
+
REGEX_SUFX = /vlees|ringen|ring|filets|filet/i
|
9
|
+
|
10
|
+
REGEX = /
|
11
|
+
(?: \b verse \s+ )?
|
12
|
+
\b
|
13
|
+
(
|
14
|
+
(?: #{REGEX_AREA} \s+ )? (?: #{REGEX_ATTR} \s* )? #{REGEX_NAME} (?: #{REGEX_SUFX} )?
|
15
|
+
)
|
16
|
+
\b
|
17
|
+
/ix
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require_relative 'fish_name_nl'
|
2
|
+
|
3
|
+
module FoodFishParser
|
4
|
+
module Flat
|
5
|
+
module Kind
|
6
|
+
|
7
|
+
REGEX_WILD = /
|
8
|
+
\b
|
9
|
+
(?:
|
10
|
+
gevangen |
|
11
|
+
visgebied |
|
12
|
+
vangstgebied |
|
13
|
+
vangsgebied |
|
14
|
+
betrapt \s+ bij |
|
15
|
+
wilde? \s+ #{FishNameNL::REGEX}
|
16
|
+
)
|
17
|
+
\b
|
18
|
+
/ix
|
19
|
+
|
20
|
+
REGEX_AQUAC = /
|
21
|
+
\b
|
22
|
+
(?:
|
23
|
+
gekweekt |
|
24
|
+
aquacultuurproduct |
|
25
|
+
aquacultuur \s+ product |
|
26
|
+
kweekmethode |
|
27
|
+
kweekmethoden
|
28
|
+
)
|
29
|
+
\b
|
30
|
+
/ix
|
31
|
+
|
32
|
+
def self.is_wild?(text)
|
33
|
+
!!REGEX_WILD.match(text)
|
34
|
+
end
|
35
|
+
|
36
|
+
def self.is_aquac?(text)
|
37
|
+
!!REGEX_AQUAC.match(text)
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require_relative 'fish_name'
|
2
|
+
require_relative 'fao_region'
|
3
|
+
require_relative 'kind'
|
4
|
+
|
5
|
+
module FoodFishParser
|
6
|
+
module Flat
|
7
|
+
class Parser
|
8
|
+
|
9
|
+
# Create a new fish detail parser
|
10
|
+
# @return [FoodFishParser::Flat::Parser]
|
11
|
+
def initialize
|
12
|
+
end
|
13
|
+
|
14
|
+
# Parse food fish text into a structured representation.
|
15
|
+
#
|
16
|
+
# @param s [String] text to parse
|
17
|
+
# @return [Array<Hash>] structured representation of fish details (maximum one item)
|
18
|
+
def parse(s, **options)
|
19
|
+
names = FishName.find_all(s)
|
20
|
+
areas = FaoRegion.find_all(s)
|
21
|
+
methods = [] # @todo
|
22
|
+
|
23
|
+
is_wild = Kind.is_wild?(s)
|
24
|
+
is_aquac = Kind.is_aquac?(s)
|
25
|
+
|
26
|
+
return [] unless names.any? || areas.any?
|
27
|
+
|
28
|
+
attrs = {
|
29
|
+
names: names,
|
30
|
+
catch_areas: nil,
|
31
|
+
catch_methods: nil,
|
32
|
+
aquaculture_areas: nil,
|
33
|
+
aquaculture_methods: nil
|
34
|
+
}
|
35
|
+
|
36
|
+
if is_wild && !is_aquac
|
37
|
+
[attrs.merge(catch_areas: areas, catch_methods: methods)]
|
38
|
+
elsif !is_wild && is_aquac
|
39
|
+
[attrs.merge(aquaculture_areas: areas, aquaculture_methods: methods)]
|
40
|
+
elsif areas.any? || methods.any?
|
41
|
+
# We have a problem: either there are multiple fish and they're a mix of
|
42
|
+
# wild and aquaculture fish, or there is no such indication at all.
|
43
|
+
# For now, we return it in a non-standard way (this needs to be tackled).
|
44
|
+
[attrs.merge(areas: areas, methods: methods)]
|
45
|
+
else
|
46
|
+
# just names
|
47
|
+
[attrs]
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
module FoodFishParser::Grammar
|
1
|
+
module FoodFishParser::Strict::Grammar
|
2
2
|
grammar FaoArea
|
3
3
|
include Common
|
4
4
|
|
@@ -45,8 +45,9 @@ module FoodFishParser::Grammar
|
|
45
45
|
|
46
46
|
rule fao_area_sub_code
|
47
47
|
(
|
48
|
-
|
49
|
-
( dash
|
48
|
+
# IVX for roman numerals, but l for incorrectly OCR-ed text
|
49
|
+
( ( dash / '/' / ws* )? [ivxIVXl]+ ) /
|
50
|
+
( dash [0-9]+ )
|
50
51
|
)
|
51
52
|
fao_area_suffix?
|
52
53
|
(
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module FoodFishParser::Strict::Grammar
|
2
|
+
grammar FishName
|
3
|
+
include Common
|
4
|
+
include FishNameLatin
|
5
|
+
include FishNameNL
|
6
|
+
|
7
|
+
rule fish_name_both
|
8
|
+
( fish_name_nl ws* '(' ws* fish_name_latin ( ws* ')' / comma )? )
|
9
|
+
end
|
10
|
+
|
11
|
+
rule fish_name_both_list
|
12
|
+
( fish_name_both <FishNameNode> )
|
13
|
+
( ws+ and_or ws+ fish_name_both <FishNameNode> )*
|
14
|
+
end
|
15
|
+
|
16
|
+
rule fish_name_latin_list
|
17
|
+
( fish_name_latin <FishNameNode> )
|
18
|
+
( ws+ and_or ws+ fish_name_latin <FishNameNode> )*
|
19
|
+
end
|
20
|
+
|
21
|
+
rule fish_name_nl_list
|
22
|
+
( fish_name_nl <FishNameNode> )
|
23
|
+
( ws+ and_or ws+ fish_name_nl <FishNameNode> )*
|
24
|
+
end
|
25
|
+
|
26
|
+
rule fish_name_any_list
|
27
|
+
fish_name_both_list / fish_name_latin_list / fish_name_nl_list
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
@@ -1,5 +1,5 @@
|
|
1
|
-
# autogenerated by species-treetop-gen-latin.rb on 2020-03-
|
2
|
-
module FoodFishParser::Grammar
|
1
|
+
# autogenerated by species-treetop-gen-latin.rb on 2020-03-19
|
2
|
+
module FoodFishParser::Strict::Grammar
|
3
3
|
grammar FishNameLatin
|
4
4
|
include Common
|
5
5
|
|
@@ -9,11 +9,11 @@ module FoodFishParser::Grammar
|
|
9
9
|
end
|
10
10
|
|
11
11
|
rule fish_name_latin_first
|
12
|
-
'zygochlamys'i / 'zeus'i / 'xiphopenaeus'i / 'xiphias'i / 'undaria'i / 'ulva'i / 'trichiurus'i / 'trachurus'i / 'todarodes'i / 'thunnus'i / 'theragra'i / 'stolephorus'i / 'sprattus'i / 'spirulina'i / 'sparus'i / 'solea'i / 'sepiella'i / 'sepia'i / 'sebastes'i / 'scomber'i / 'sardinella'i / 'sardina'i / 'salmo'i / 'saccharina'i / 'reinhardtius'i / 'psetta'i / 'procambarus'i / 'portunus'i / 'porphyra'i / 'pollachius'i / 'pleuronectes'i / 'pleoticus'i / 'placopecten'i / 'phymatolithon'i / 'perna'i / 'penaeus'i / 'penaeidae'i / 'pelvetia'i / 'pecten'i / 'patinopecten'i / 'parapenaeopsis'i / 'paralomis'i / 'paphia'i / 'pangasius'i / 'pandalus'i / 'palmaria'i / 'pagellus'i / '
|
12
|
+
'zygochlamys'i / 'zeus'i / 'xiphopenaeus'i / 'xiphias'i / 'undaria'i / 'ulva'i / 'trichiurus'i / 'trachurus'i / 'todarodes'i / 'thunnus'i / 'theragra'i / 'stolephorus'i / 'sprattus'i / 'spirulina'i / 'sparus'i / 'solea'i / 'sepiella'i / 'sepia'i / 'sebastes'i / 'scomber'i / 'sardinella'i / 'sardina'i / 'salmo'i / 'saccharina'i / 'reinhardtius'i / 'psetta'i / 'procambarus'i / 'portunus'i / 'porphyra'i / 'pollachius'i / 'pleuronectes'i / 'pleoticus'i / 'placopecten'i / 'phymatolithon'i / 'perna'i / 'penaeus'i / 'penaeidae'i / 'pelvetia'i / 'pecten'i / 'patinopecten'i / 'parapenaeopsis'i / 'paralomis'i / 'paphia'i / 'pangasius'i / 'pandalus'i / 'palmaria'i / 'pagellus'i / 'ovalipes'i / 'ostrea'i / 'oreochromis'i / 'oncorhynchus'i / 'octopus'i / 'nephrops'i / 'nemipterus'i / 'nelumbo'i / 'mytilus'i / 'mulinia'i / 'micromesistius'i / 'metapenaeus'i / 'merluccius'i / 'merlangius'i / 'melanogrammus'i / 'macruronus'i / 'macrocystis'i / 'lophius'i / 'loligo'i / 'litopenaeus'i / 'lithodes'i / 'limanda'i / 'lethrinus'i / 'lepidotrigla'i / 'lepidopsetta'i / 'lates'i / 'laminaria'i / 'katwonus'i / 'katwomus'i / 'katsuwonus'i / 'katsuwomus'i / 'illex'i / 'homarus'i / 'himanthalia'i / 'haematococcus'i / 'gracilaria'i / 'gelidium'i / 'gadus'i / 'fucus'i / 'euthynnus'i / 'ensis'i / 'engraulis'i / 'dunaliella'i / 'dosidicus'i / 'dicentrarchus'i / 'crassostrea'i / 'crangon'i / 'clupea'i / 'clarias'i / 'chondrus'i / 'chlorella'i / 'cerastoderma'i / 'caulerpa'i / 'ascophyllum'i / 'anguilla'i / 'anadara'i / 'alle'i / 'alaria'i / 'acipenser'i
|
13
13
|
end
|
14
14
|
|
15
15
|
rule fish_name_latin_second
|
16
|
-
'yessoensis'i / 'vulgaris'i / 'virens'i / 'vesiculosus'i / 'verrucosa'i / 'vannamei'i / 'undulata'i / 'umbilicalis'i / 'tenera'i / 'stylifera'i / 'sprattus'i / 'spp.'i / 'spp'i / 'sombrus'i / 'solea'i / 'solar'i / 'scombrus'i / 'santolla'i / 'salina'i / 'salar'i / 'ringens'i / 'pyrifera'i / 'pyrenoidosa'i / 'punctatus'i / 'productus'i / 'pluvialis'i / 'platessa'i / 'platensis'i / 'piscatorius'i / 'pinnatifida'i / 'pilchardus'i / 'pelatis'i / 'pelanis'i / 'pelamis'i / 'pelagicus'i / 'patagonica'i / 'pangasius'i / 'palmata'i / 'pacificus'i / 'officinalis'i / 'ocellatus'i / 'nucifera'i / 'novaezelandiae'i / 'novaezealandiae'i / 'norvegicus'i / 'nodosum'i / 'niloticus'i / 'nerka'i / 'mykiss'i / 'mykis'i / 'myki?s'i / 'murphyi'i / 'muelleri'i / 'morhue'i / 'morhua'i / 'monodon'i / 'monoceros'i / 'microptera'i / 'merluccius'i / 'merlangus'i / 'merguiensis'i / 'maximus'i / 'maxima'i / 'marinus'i / 'magellanicus'i / 'macrocephalus'i / 'limanda'i / 'lepturus'i / 'lentillifera'i / 'latissima'i / 'lactuca'i / 'labrax'i / 'kroyeri'i / 'kisutch'i / 'keta'i / '
|
16
|
+
'yessoensis'i / 'vulgaris'i / 'virens'i / 'vesiculosus'i / 'verrucosa'i / 'vannamei'i / 'undulata'i / 'umbilicalis'i / 'tenera'i / 'stylifera'i / 'sprattus'i / 'spp.'i / 'spp'i / 'sombrus'i / 'solea'i / 'solar'i / 'scombrus'i / 'santolla'i / 'salina'i / 'salar'i / 'ringens'i / 'pyrifera'i / 'pyrenoidosa'i / 'punctatus'i / 'productus'i / 'pluvialis'i / 'platessa'i / 'platensis'i / 'piscatorius'i / 'pinnatifida'i / 'pilchardus'i / 'pelatis'i / 'pelanis'i / 'pelamis'i / 'pelagicus'i / 'patagonica'i / 'pangasius'i / 'palmata'i / 'pacificus'i / 'officinalis'i / 'ocellatus'i / 'nucifera'i / 'novaezelandiae'i / 'novaezealandiae'i / 'norvegicus'i / 'nodosum'i / 'niloticus'i / 'nerka'i / 'mykiss'i / 'mykis'i / 'myki?s'i / 'murphyi'i / 'muelleri'i / 'morhue'i / 'morhua'i / 'monodon'i / 'monoceros'i / 'microptera'i / 'merluccius'i / 'merlangus'i / 'merguiensis'i / 'maximus'i / 'maxima'i / 'marinus'i / 'magellanicus'i / 'macrocephalus'i / 'limanda'i / 'lepturus'i / 'lentillifera'i / 'latissima'i / 'lactuca'i / 'labrax'i / 'kroyeri'i / 'kisutch'i / 'keta'i / 'jordani'i / 'japonicus'i / 'japonica'i / 'hyptophthalmus'i / 'hypophtalmus'i / 'hippoglossoides'i / 'hexodon'i / 'harengus'i / 'gueldenstaedtii'i / 'granulosa'i / 'gorbusche'i / 'gorbuscha'i / 'gladius'i / 'gigas'i / 'gibbosa'i / 'gariepinus'i / 'galloprovincialis'i / 'faber'i / 'esculenta'i / 'encrasicolus'i / 'elongata'i / 'edulis'i / 'edule'i / 'directus'i / 'digitata'i / 'crispus'i / 'crangon'i / 'clarkii'i / 'chilensis'i / 'chalcogramma'i / 'capensis'i / 'cannamei'i / 'canaliculus'i / 'canaliculata'i / 'calcareum'i / 'borealis'i / 'bogaraveo'i / 'bilineata'i / 'australis'i / 'aurata'i / 'argentinus'i / 'antiquata'i / 'anquilla'i / 'anguilla'i / 'anchoita'i / 'americanus'i / 'alle'i / 'albacares'i / 'alalunga'i / 'aeglefinus'i
|
17
17
|
end
|
18
18
|
end
|
19
19
|
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
module FoodFishParser::Grammar
|
1
|
+
module FoodFishParser::Strict::Grammar
|
2
2
|
grammar Root
|
3
3
|
include Common
|
4
4
|
include FishName
|
@@ -18,8 +18,10 @@ module FoodFishParser::Grammar
|
|
18
18
|
rule root
|
19
19
|
(
|
20
20
|
( fish_with_info ( ws* fish_sep ws* fish_with_info )* ) /
|
21
|
-
(
|
22
|
-
(
|
21
|
+
( fish_names_both ( ws* fish_sep ws* fish_names_both )* ) /
|
22
|
+
( fish_names_latin ( ws* fish_sep ws* fish_names_latin )* ) /
|
23
|
+
( fish_only_info ( ws* fish_sep ws* fish_only_info )* ) /
|
24
|
+
( fish_names_nl ( ws* fish_sep ws* fish_names_nl )* )
|
23
25
|
)
|
24
26
|
ws* '.'? ws*
|
25
27
|
<RootNode>
|
@@ -31,8 +33,10 @@ module FoodFishParser::Grammar
|
|
31
33
|
rule root_anywhere
|
32
34
|
(
|
33
35
|
( !fish_with_info . )* ( fish_with_info ( ( !fish_with_info . )* fish_with_info )* ) ( !fish_with_info . )* /
|
34
|
-
( !
|
35
|
-
( !
|
36
|
+
( !fish_names_both . )* ( fish_names_both ( ( !fish_names_both . )+ fish_names_both )* ) ( !fish_names_both . )* /
|
37
|
+
( !fish_names_latin . )* ( fish_names_latin ( ( !fish_names_latin . )+ fish_names_latin )* ) ( !fish_names_latin . )* /
|
38
|
+
( !fish_only_info . )* ( fish_only_info ( ( !fish_only_info . )+ fish_only_info )* ) ( !fish_only_info . )* /
|
39
|
+
( !fish_names_nl . )* ( fish_names_nl ( ( !fish_names_nl . )+ fish_names_nl )* ) ( !fish_names_nl . )*
|
36
40
|
)
|
37
41
|
<RootNode>
|
38
42
|
end
|
@@ -45,15 +49,25 @@ module FoodFishParser::Grammar
|
|
45
49
|
# fish with catch or aquaculture info
|
46
50
|
rule fish_with_info
|
47
51
|
(
|
48
|
-
(
|
49
|
-
(
|
52
|
+
( fish_name_any_list ( ws* ( comma / ':' ) )? ws+ fish_catch_info ) /
|
53
|
+
( fish_name_any_list ( ws* ( comma / ':' ) )? ws+ fish_aquac_info )
|
50
54
|
)
|
51
55
|
<FishNode>
|
52
56
|
end
|
53
57
|
|
54
|
-
# fish names
|
55
|
-
rule
|
56
|
-
|
58
|
+
# fish names common and latin
|
59
|
+
rule fish_names_both
|
60
|
+
fish_name_both_list <FishNode>
|
61
|
+
end
|
62
|
+
|
63
|
+
# fish names Latin only
|
64
|
+
rule fish_names_latin
|
65
|
+
fish_name_latin_list <FishNode>
|
66
|
+
end
|
67
|
+
|
68
|
+
# fish names NL only
|
69
|
+
rule fish_names_nl
|
70
|
+
fish_name_nl_list <FishNode>
|
57
71
|
end
|
58
72
|
|
59
73
|
# catch or aquaculture info only (no names)
|
File without changes
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require_relative 'grammar'
|
2
|
+
|
3
|
+
module FoodFishParser
|
4
|
+
module Strict
|
5
|
+
class Parser
|
6
|
+
|
7
|
+
# @!attribute [r] parser
|
8
|
+
# @return [Treetop::Runtime::CompiledParser] low-level parser object
|
9
|
+
# @note This attribute is there for convenience, but may change in the future. Take care.
|
10
|
+
attr_reader :parser
|
11
|
+
|
12
|
+
# Create a new fish detail parser
|
13
|
+
# @return [FoodFishParser::Strict::Parser]
|
14
|
+
def initialize
|
15
|
+
@parser = Grammar::RootParser.new
|
16
|
+
end
|
17
|
+
|
18
|
+
# Parse food fish text into a structured representation.
|
19
|
+
#
|
20
|
+
# @param s [String] text to parse
|
21
|
+
# @param anywhere [Bool] +false+ assume the text is only fish details, +true+ to search for fish details in the text
|
22
|
+
# @return [FoodFishParser::Strict::Grammar::RootNode] structured representation of fish details
|
23
|
+
# @note Unrecognized options are passed to Treetop, but this is not guaranteed to remain so forever.
|
24
|
+
def parse(s, anywhere: false, **options)
|
25
|
+
if anywhere
|
26
|
+
options = options.merge(root: :root_anywhere, consume_all_input: false)
|
27
|
+
end
|
28
|
+
|
29
|
+
@parser.parse(s, **options)
|
30
|
+
end
|
31
|
+
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
data/lib/food_fish_parser.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: food_fish_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- wvengen
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-03-
|
11
|
+
date: 2020-03-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: treetop
|
@@ -43,20 +43,26 @@ files:
|
|
43
43
|
- bin/food_fish_parser
|
44
44
|
- food_fish_parser.gemspec
|
45
45
|
- lib/food_fish_parser.rb
|
46
|
-
- lib/food_fish_parser/
|
47
|
-
- lib/food_fish_parser/
|
48
|
-
- lib/food_fish_parser/
|
49
|
-
- lib/food_fish_parser/
|
50
|
-
- lib/food_fish_parser/
|
51
|
-
- lib/food_fish_parser/
|
52
|
-
- lib/food_fish_parser/grammar
|
53
|
-
- lib/food_fish_parser/grammar/
|
54
|
-
- lib/food_fish_parser/grammar/
|
55
|
-
- lib/food_fish_parser/grammar/
|
56
|
-
- lib/food_fish_parser/grammar/
|
57
|
-
- lib/food_fish_parser/grammar/
|
58
|
-
- lib/food_fish_parser/
|
59
|
-
- lib/food_fish_parser/
|
46
|
+
- lib/food_fish_parser/flat/fao_region.rb
|
47
|
+
- lib/food_fish_parser/flat/fish_name.rb
|
48
|
+
- lib/food_fish_parser/flat/fish_name_latin.rb
|
49
|
+
- lib/food_fish_parser/flat/fish_name_nl.rb
|
50
|
+
- lib/food_fish_parser/flat/kind.rb
|
51
|
+
- lib/food_fish_parser/flat/parser.rb
|
52
|
+
- lib/food_fish_parser/strict/grammar.rb
|
53
|
+
- lib/food_fish_parser/strict/grammar/aquac_area.treetop
|
54
|
+
- lib/food_fish_parser/strict/grammar/aquac_method.treetop
|
55
|
+
- lib/food_fish_parser/strict/grammar/catch_area.treetop
|
56
|
+
- lib/food_fish_parser/strict/grammar/catch_method.treetop
|
57
|
+
- lib/food_fish_parser/strict/grammar/common.treetop
|
58
|
+
- lib/food_fish_parser/strict/grammar/fao_area.treetop
|
59
|
+
- lib/food_fish_parser/strict/grammar/fish_name.treetop
|
60
|
+
- lib/food_fish_parser/strict/grammar/fish_name_latin.treetop
|
61
|
+
- lib/food_fish_parser/strict/grammar/fish_name_nl.treetop
|
62
|
+
- lib/food_fish_parser/strict/grammar/root.treetop
|
63
|
+
- lib/food_fish_parser/strict/grammar/words.treetop
|
64
|
+
- lib/food_fish_parser/strict/nodes.rb
|
65
|
+
- lib/food_fish_parser/strict/parser.rb
|
60
66
|
- lib/food_fish_parser/version.rb
|
61
67
|
homepage: https://github.com/q-m/food-fish-parser-ruby
|
62
68
|
licenses:
|
@@ -1,21 +0,0 @@
|
|
1
|
-
module FoodFishParser::Grammar
|
2
|
-
grammar FishName
|
3
|
-
include Common
|
4
|
-
include FishNameLatin
|
5
|
-
include FishNameNL
|
6
|
-
|
7
|
-
rule fish_name
|
8
|
-
(
|
9
|
-
fish_name_nl ws* '(' ws* fish_name_latin ws* ')' /
|
10
|
-
fish_name_nl /
|
11
|
-
fish_name_latin
|
12
|
-
)
|
13
|
-
<FishNameNode>
|
14
|
-
end
|
15
|
-
|
16
|
-
rule fish_name_list
|
17
|
-
fish_name ( ws+ and_or ws+ fish_name )*
|
18
|
-
end
|
19
|
-
|
20
|
-
end
|
21
|
-
end
|
@@ -1,32 +0,0 @@
|
|
1
|
-
require_relative 'grammar'
|
2
|
-
|
3
|
-
module FoodFishParser
|
4
|
-
class Parser
|
5
|
-
|
6
|
-
# @!attribute [r] parser
|
7
|
-
# @return [Treetop::Runtime::CompiledParser] low-level parser object
|
8
|
-
# @note This attribute is there for convenience, but may change in the future. Take care.
|
9
|
-
attr_reader :parser
|
10
|
-
|
11
|
-
# Create a new fish detail parser
|
12
|
-
# @return [FoodFishParser::Parser]
|
13
|
-
def initialize
|
14
|
-
@parser = Grammar::RootParser.new
|
15
|
-
end
|
16
|
-
|
17
|
-
# Parse food fish text into a structured representation.
|
18
|
-
#
|
19
|
-
# @param s [String] text to parse
|
20
|
-
# @param anywhere [Bool] +false+ assume the text is only fish details, +true+ to search for fish details in the text
|
21
|
-
# @return [FoodFishParser::Grammar::RootNode] structured representation of fish details
|
22
|
-
# @note Unrecognized options are passed to Treetop, but this is not guarenteed to remain so forever.
|
23
|
-
def parse(s, anywhere: false, **options)
|
24
|
-
if anywhere
|
25
|
-
options = options.merge(root: :root_anywhere, consume_all_input: false)
|
26
|
-
end
|
27
|
-
|
28
|
-
@parser.parse(s, **options)
|
29
|
-
end
|
30
|
-
|
31
|
-
end
|
32
|
-
end
|