food_fish_parser 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +44 -2
- data/bin/food_fish_parser +11 -2
- data/lib/food_fish_parser/flat/fao_region.rb +35 -0
- data/lib/food_fish_parser/flat/fish_name.rb +34 -0
- data/lib/food_fish_parser/flat/fish_name_latin.rb +17 -0
- data/lib/food_fish_parser/flat/fish_name_nl.rb +20 -0
- data/lib/food_fish_parser/flat/kind.rb +41 -0
- data/lib/food_fish_parser/flat/parser.rb +52 -0
- data/lib/food_fish_parser/{grammar → strict/grammar}/aquac_area.treetop +1 -1
- data/lib/food_fish_parser/{grammar → strict/grammar}/aquac_method.treetop +1 -1
- data/lib/food_fish_parser/{grammar → strict/grammar}/catch_area.treetop +1 -1
- data/lib/food_fish_parser/{grammar → strict/grammar}/catch_method.treetop +1 -1
- data/lib/food_fish_parser/{grammar → strict/grammar}/common.treetop +1 -1
- data/lib/food_fish_parser/{grammar → strict/grammar}/fao_area.treetop +4 -3
- data/lib/food_fish_parser/strict/grammar/fish_name.treetop +30 -0
- data/lib/food_fish_parser/{grammar → strict/grammar}/fish_name_latin.treetop +4 -4
- data/lib/food_fish_parser/{grammar → strict/grammar}/fish_name_nl.treetop +2 -2
- data/lib/food_fish_parser/{grammar → strict/grammar}/root.treetop +24 -10
- data/lib/food_fish_parser/{grammar → strict/grammar}/words.treetop +1 -1
- data/lib/food_fish_parser/{grammar.rb → strict/grammar.rb} +0 -0
- data/lib/food_fish_parser/{nodes.rb → strict/nodes.rb} +1 -1
- data/lib/food_fish_parser/strict/parser.rb +34 -0
- data/lib/food_fish_parser/version.rb +2 -2
- data/lib/food_fish_parser.rb +2 -1
- metadata +22 -16
- data/lib/food_fish_parser/grammar/fish_name.treetop +0 -21
- data/lib/food_fish_parser/parser.rb +0 -32
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e92312ff6eec700a2c57b1c48cad77fb9710dd3086b2fa4ded31e352d0bef9c5
|
4
|
+
data.tar.gz: 404c5692f4a731e1614c9c4f8628ddf01f599192740e205c4e9f064411d6e18a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2de059ea4900f0aea03194559b275f1c7b307c1c06dc6a0b02a1bf17c21bccdc5a8fce98ad6fa178ad99181ad0ab36d5a609c82edd960280c46efbc2f1e71cb7
|
7
|
+
data.tar.gz: 7fdc6287c1285c7868d784d771b0f983d0c4fb975e3fc82eeff52a2f80768b12455caebad5d487b4b1b7816126f33ec6c51e04e91d498bb128e500aa34e183d2
|
data/README.md
CHANGED
@@ -18,6 +18,8 @@ gem install food_fish_parser
|
|
18
18
|
|
19
19
|
## Example
|
20
20
|
|
21
|
+
### Strict parser
|
22
|
+
|
21
23
|
```ruby
|
22
24
|
require 'food_fish_parser'
|
23
25
|
|
@@ -28,7 +30,7 @@ s = <<EOT.gsub(/\n/, '').strip
|
|
28
30
|
kabeljauw (gadus macrocephalus), vangstgebied stille oceaan fao 67, garnaal
|
29
31
|
(litopenaeus vannamei), gekweekt in ecuador, kweekmethode: vijver.
|
30
32
|
EOT
|
31
|
-
parser = FoodFishParser::Parser.new
|
33
|
+
parser = FoodFishParser::Strict::Parser.new
|
32
34
|
puts parser.parse(s).to_a.inspect
|
33
35
|
```
|
34
36
|
|
@@ -74,13 +76,15 @@ Results in a list of detected fishes
|
|
74
76
|
]
|
75
77
|
```
|
76
78
|
|
79
|
+
### Anywhere
|
80
|
+
|
77
81
|
When you have a piece of text and don't know where (or if) any fish details are
|
78
82
|
present, you can use the `anywhere` option.
|
79
83
|
|
80
84
|
```ruby
|
81
85
|
require 'food_fish_parser'
|
82
86
|
|
83
|
-
parser = FoodFishParser::Parser.new
|
87
|
+
parser = FoodFishParser::Strict::Parser.new
|
84
88
|
s = "tomaat, vis (zalm (salmo salar) gevangen in Noorwegen), zout"
|
85
89
|
puts parser.parse(s, anywhere: true).to_a.inspect
|
86
90
|
```
|
@@ -104,6 +108,43 @@ While the parser would normally return nothing, with `anywhere` it returns:
|
|
104
108
|
|
105
109
|
Please note that the `anywhere` option can make the parser much slower.
|
106
110
|
|
111
|
+
### Flat parser
|
112
|
+
|
113
|
+
While the strict parser can recognize the structure of multiple fishes, it is really
|
114
|
+
strict about what it expects. Many cases are not recognized, or are recognized only incompletely.
|
115
|
+
|
116
|
+
The flat parser does basic named entity recognition anywhere in the text. Any structure
|
117
|
+
is lost, so it always returns an array with one or zero items - but you get all the
|
118
|
+
FAO regions and fish names found.
|
119
|
+
|
120
|
+
```ruby
|
121
|
+
require 'food_fish_parser'
|
122
|
+
|
123
|
+
parser = FoodFishParser::Flat::Parser.new
|
124
|
+
s = "Foobar zalm (salmo salar) *&! gevangen pangasius spp FAO 61 ?or ?FAO 67 what more."
|
125
|
+
puts parser.parse(s).to_a.inspect
|
126
|
+
```
|
127
|
+
|
128
|
+
```
|
129
|
+
[
|
130
|
+
{
|
131
|
+
:names => [
|
132
|
+
{ :common=>"zalm", :latin=>"salmo salar" },
|
133
|
+
{ :common=>nil, :latin=>"pangasius spp" }
|
134
|
+
],
|
135
|
+
:catch_areas => [
|
136
|
+
{ :name=>nil, :fao_codes=>["61"] },
|
137
|
+
{ :name=>nil, :fao_codes=>["67"] }
|
138
|
+
],
|
139
|
+
:catch_methods => [],
|
140
|
+
:aquaculture_areas => nil,
|
141
|
+
:aquaculture_methods => nil
|
142
|
+
}
|
143
|
+
]
|
144
|
+
```
|
145
|
+
|
146
|
+
This might be expanded to more information at some point.
|
147
|
+
|
107
148
|
|
108
149
|
## Test tool
|
109
150
|
|
@@ -120,6 +161,7 @@ Usage: bin/food_fish_parser [options] --file|-f <filename>
|
|
120
161
|
-q, --[no-]quiet Only show summary.
|
121
162
|
-p, --parsed Only show lines that were successfully parsed.
|
122
163
|
-n, --noresult Only show lines that had no result.
|
164
|
+
-r, --parser PARSER Use specific parser (strict, flat).
|
123
165
|
-a, --[no-]anywhere Search for fish details anywhere in the text.
|
124
166
|
-e, --[no-]escape Escape newlines
|
125
167
|
-c, --[no-]color Use color
|
data/bin/food_fish_parser
CHANGED
@@ -72,9 +72,14 @@ verbosity = 1
|
|
72
72
|
files = []
|
73
73
|
strings = []
|
74
74
|
print = nil
|
75
|
+
parser_name = :strict
|
75
76
|
anywhere = false
|
76
77
|
escape = false
|
77
78
|
color = true
|
79
|
+
PARSERS = {
|
80
|
+
strict: FoodFishParser::Strict::Parser,
|
81
|
+
flat: FoodFishParser::Flat::Parser
|
82
|
+
}
|
78
83
|
OptionParser.new do |opts|
|
79
84
|
opts.banner = <<-EOF.gsub(/^ /, '')
|
80
85
|
Usage: #{$0} [options] --file|-f <filename>
|
@@ -88,7 +93,8 @@ OptionParser.new do |opts|
|
|
88
93
|
opts.on("-q", "--[no-]quiet", "Only show summary.") {|q| verbosity = q ? 0 : 1 }
|
89
94
|
opts.on("-p", "--parsed", "Only show lines that were successfully parsed.") {|p| print = :parsed }
|
90
95
|
opts.on("-n", "--noresult", "Only show lines that had no result.") {|p| print = :noresult }
|
91
|
-
opts.on("-
|
96
|
+
opts.on("-r", "--parser PARSER", "Use specific parser (#{PARSERS.keys.join(", ")}).") {|p| parser_name = p&.downcase&.to_sym }
|
97
|
+
opts.on("-a", "--[no-]anywhere", "Search for fish details anywhere in the text (only strict parser).") {|a| anywhere = !!a }
|
92
98
|
opts.on("-e", "--[no-]escape", "Escape newlines") {|e| escape = !!e }
|
93
99
|
opts.on("-c", "--[no-]color", "Use color") {|e| color = !!e }
|
94
100
|
opts.on("-v", "--[no-]verbose", "Show more data (parsed tree).") {|v| verbosity = v ? 2 : 1 }
|
@@ -103,7 +109,10 @@ OptionParser.new do |opts|
|
|
103
109
|
end.parse!
|
104
110
|
|
105
111
|
if strings.any? || files.any?
|
106
|
-
parser =
|
112
|
+
unless parser = PARSERS[parser_name]&.new
|
113
|
+
STDERR.puts("Please specify one of the known parsers: #{PARSERS.keys.join(", ")}.")
|
114
|
+
exit(1)
|
115
|
+
end
|
107
116
|
success = true
|
108
117
|
strings.each {|s| success &= parse_single(s, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color, anywhere: anywhere) }
|
109
118
|
files.each {|f| success &= parse_file(f, parser: parser, verbosity: verbosity, print: print, escape: escape, color: color, anywhere: anywhere) }
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module FoodFishParser
|
2
|
+
module Flat
|
3
|
+
module FaoRegion
|
4
|
+
|
5
|
+
REGEX_CODE = / 0?\d{2} (?: (?: -0?\d{2} | (?: \/ | - | \s+ | ) [lIV]+ ) [a-d]? )? /ix
|
6
|
+
REGEX = /
|
7
|
+
\b FA[O0]
|
8
|
+
\s* (?: -? (?:zone|gebied) )? \s* (?:nummer|nr\.?|,)?
|
9
|
+
\s*
|
10
|
+
(
|
11
|
+
#{REGEX_CODE}
|
12
|
+
(?:
|
13
|
+
(?: ,?\s*en\s* #{REGEX_CODE} ) |
|
14
|
+
(?: \/ #{REGEX_CODE} )+ |
|
15
|
+
(?: (?: ,\s* #{REGEX_CODE} )+ (?: ,?\s*en\s* #{REGEX_CODE} )? )
|
16
|
+
)?
|
17
|
+
)
|
18
|
+
\b
|
19
|
+
/ix
|
20
|
+
|
21
|
+
def self.find_all_code_groups(text)
|
22
|
+
text
|
23
|
+
.scan(REGEX)
|
24
|
+
.flatten(1)
|
25
|
+
.map {|s| s.split(/\s*(?: en | ,\s*en | , | \/ )\s*/xi) }
|
26
|
+
.map {|a| a.map {|s| s.gsub(/\A0(\d{2})\z/, '\1') } } # normalize leading zeroes (e.g. "061" becomes "61")
|
27
|
+
end
|
28
|
+
|
29
|
+
def self.find_all(text)
|
30
|
+
find_all_code_groups(text)
|
31
|
+
.map {|a| { name: nil, fao_codes: a } }
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require_relative 'fish_name_latin'
|
2
|
+
require_relative 'fish_name_nl'
|
3
|
+
|
4
|
+
module FoodFishParser
|
5
|
+
module Flat
|
6
|
+
module FishName
|
7
|
+
REGEX_VIS = /\s*(?: \(\s*vis\s*\) | \b,?\s*vis\s*,?\s*\b )/ix
|
8
|
+
|
9
|
+
REGEX = /
|
10
|
+
(?:
|
11
|
+
#{FishNameNL::REGEX} (?: #{REGEX_VIS} )? \s* \( \s* #{FishNameLatin::REGEX} \s* \) |
|
12
|
+
#{FishNameNL::REGEX} \s*,?\s* \b #{FishNameLatin::REGEX} |
|
13
|
+
#{FishNameLatin::REGEX} |
|
14
|
+
#{FishNameNL::REGEX}
|
15
|
+
)
|
16
|
+
/ix
|
17
|
+
|
18
|
+
def self.find_all(text)
|
19
|
+
# Because scan doesn't support named captures, we have to use numbered capture groups.
|
20
|
+
# Make sure to keep all groups you don't want to reference below as non-capturing groups.
|
21
|
+
# Each name regex has a capture group (so as to avoid noise), so you don't see them here.
|
22
|
+
# The order of the captures corresponds to the order of the fish names in the regex above.
|
23
|
+
text.scan(REGEX).map do |m|
|
24
|
+
case
|
25
|
+
when m[0] && m[1] then { common: m[0], latin: m[1] }
|
26
|
+
when m[2] && m[3] then { common: m[2], latin: m[3] }
|
27
|
+
when m[4] then { common: nil, latin: m[4] }
|
28
|
+
when m[5] then { common: m[5], latin: nil }
|
29
|
+
end
|
30
|
+
end.compact
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# autogenerated by species-treetop-gen-latin.rb on 2020-03-19
|
2
|
+
module FoodFishParser
|
3
|
+
module Flat
|
4
|
+
module FishNameLatin
|
5
|
+
REGEX_FIRST = /zygochlamys|zeus|xiphopenaeus|xiphias|undaria|ulva|trichiurus|trachurus|todarodes|thunnus|theragra|stolephorus|sprattus|spirulina|sparus|solea|sepiella|sepia|sebastes|scomber|sardinella|sardina|salmo|saccharina|reinhardtius|psetta|procambarus|portunus|porphyra|pollachius|pleuronectes|pleoticus|placopecten|phymatolithon|perna|penaeus|penaeidae|pelvetia|pecten|patinopecten|parapenaeopsis|paralomis|paphia|pangasius|pandalus|palmaria|pagellus|ovalipes|ostrea|oreochromis|oncorhynchus|octopus|nephrops|nemipterus|nelumbo|mytilus|mulinia|micromesistius|metapenaeus|merluccius|merlangius|melanogrammus|macruronus|macrocystis|lophius|loligo|litopenaeus|lithodes|limanda|lethrinus|lepidotrigla|lepidopsetta|lates|laminaria|katwonus|katwomus|katsuwonus|katsuwomus|illex|homarus|himanthalia|haematococcus|gracilaria|gelidium|gadus|fucus|euthynnus|ensis|engraulis|dunaliella|dosidicus|dicentrarchus|crassostrea|crangon|clupea|clarias|chondrus|chlorella|cerastoderma|caulerpa|ascophyllum|anguilla|anadara|alle|alaria|acipenser/i
|
6
|
+
REGEX_SECND = /yessoensis|vulgaris|virens|vesiculosus|verrucosa|vannamei|undulata|umbilicalis|tenera|stylifera|sprattus|spp\.|spp|sombrus|solea|solar|scombrus|santolla|salina|salar|ringens|pyrifera|pyrenoidosa|punctatus|productus|pluvialis|platessa|platensis|piscatorius|pinnatifida|pilchardus|pelatis|pelanis|pelamis|pelagicus|patagonica|pangasius|palmata|pacificus|officinalis|ocellatus|nucifera|novaezelandiae|novaezealandiae|norvegicus|nodosum|niloticus|nerka|mykiss|mykis|myki\?s|murphyi|muelleri|morhue|morhua|monodon|monoceros|microptera|merluccius|merlangus|merguiensis|maximus|maxima|marinus|magellanicus|macrocephalus|limanda|lepturus|lentillifera|latissima|lactuca|labrax|kroyeri|kisutch|keta|jordani|japonicus|japonica|hyptophthalmus|hypophtalmus|hippoglossoides|hexodon|harengus|gueldenstaedtii|granulosa|gorbusche|gorbuscha|gladius|gigas|gibbosa|gariepinus|galloprovincialis|faber|esculenta|encrasicolus|elongata|edulis|edule|directus|digitata|crispus|crangon|clarkii|chilensis|chalcogramma|capensis|cannamei|canaliculus|canaliculata|calcareum|borealis|bogaraveo|bilineata|australis|aurata|argentinus|antiquata|anquilla|anguilla|anchoita|americanus|alle|albacares|alalunga|aeglefinus/i
|
7
|
+
|
8
|
+
REGEX = /
|
9
|
+
\b
|
10
|
+
(
|
11
|
+
#{REGEX_FIRST} (?: \s+ #{REGEX_SECND} )?
|
12
|
+
)
|
13
|
+
\b
|
14
|
+
/ix
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# autogenerated by species-treetop-gen-nl.rb on 2020-03-19
|
2
|
+
module FoodFishParser
|
3
|
+
module Flat
|
4
|
+
module FishNameNL
|
5
|
+
REGEX_AREA = /pacifische|noorse|indische|groenlandse|atlantische|argentijnse|alaska/i
|
6
|
+
REGEX_ATTR = /zwarte|zwart|witte|witpoot|wit|roze|rood|rode|rivier|pijl|kleine|klein|grote|groot|groene|groen|doorn|coho|chum|blauwe|blauw/i
|
7
|
+
REGEX_NAME = /zonnevis|zeewolf|zeesnoek|zeekreeft|zeeforel|zeebaars|zalm|wijting|weekdieren|weekdier|vintonijn|tonijn\ albacore|tonijn|tong|tilapia|thon\ albacore|tarbot|tapijtschelp|sprot|spie|snotolf|snoekbaars|snoek|skipjack\ tonijn|schol|schelvis|schelpen|schelp|schar|sardines|regenboogforel|raat|poon|pollock|pollak|pangasius|paling|oogtonijn|mul|mosselen|mossel|meerval|mantelschelp|makreel|lom|leng|kreeft|koolvis|kokkel|karper|kabeljauw|hondstong|hoki|heilbot|heek|hake|haai|ha|gruis|griet|geep|geelvintonijn|garnalen|garnaal|fint|coquilles|cocquilles|botervis|bot|beekridder|barracuda|baars|arkschelp|ansjovis|albacore\ tonijn/i
|
8
|
+
REGEX_SUFX = /vlees|ringen|ring|filets|filet/i
|
9
|
+
|
10
|
+
REGEX = /
|
11
|
+
(?: \b verse \s+ )?
|
12
|
+
\b
|
13
|
+
(
|
14
|
+
(?: #{REGEX_AREA} \s+ )? (?: #{REGEX_ATTR} \s* )? #{REGEX_NAME} (?: #{REGEX_SUFX} )?
|
15
|
+
)
|
16
|
+
\b
|
17
|
+
/ix
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require_relative 'fish_name_nl'
|
2
|
+
|
3
|
+
module FoodFishParser
|
4
|
+
module Flat
|
5
|
+
module Kind
|
6
|
+
|
7
|
+
REGEX_WILD = /
|
8
|
+
\b
|
9
|
+
(?:
|
10
|
+
gevangen |
|
11
|
+
visgebied |
|
12
|
+
vangstgebied |
|
13
|
+
vangsgebied |
|
14
|
+
betrapt \s+ bij |
|
15
|
+
wilde? \s+ #{FishNameNL::REGEX}
|
16
|
+
)
|
17
|
+
\b
|
18
|
+
/ix
|
19
|
+
|
20
|
+
REGEX_AQUAC = /
|
21
|
+
\b
|
22
|
+
(?:
|
23
|
+
gekweekt |
|
24
|
+
aquacultuurproduct |
|
25
|
+
aquacultuur \s+ product |
|
26
|
+
kweekmethode |
|
27
|
+
kweekmethoden
|
28
|
+
)
|
29
|
+
\b
|
30
|
+
/ix
|
31
|
+
|
32
|
+
def self.is_wild?(text)
|
33
|
+
!!REGEX_WILD.match(text)
|
34
|
+
end
|
35
|
+
|
36
|
+
def self.is_aquac?(text)
|
37
|
+
!!REGEX_AQUAC.match(text)
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require_relative 'fish_name'
|
2
|
+
require_relative 'fao_region'
|
3
|
+
require_relative 'kind'
|
4
|
+
|
5
|
+
module FoodFishParser
|
6
|
+
module Flat
|
7
|
+
class Parser
|
8
|
+
|
9
|
+
# Create a new fish detail parser
|
10
|
+
# @return [FoodFishParser::Flat::Parser]
|
11
|
+
def initialize
|
12
|
+
end
|
13
|
+
|
14
|
+
# Parse food fish text into a structured representation.
|
15
|
+
#
|
16
|
+
# @param s [String] text to parse
|
17
|
+
# @return [Array<Hash>] structured representation of fish details (maximum one item)
|
18
|
+
def parse(s, **options)
|
19
|
+
names = FishName.find_all(s)
|
20
|
+
areas = FaoRegion.find_all(s)
|
21
|
+
methods = [] # @todo
|
22
|
+
|
23
|
+
is_wild = Kind.is_wild?(s)
|
24
|
+
is_aquac = Kind.is_aquac?(s)
|
25
|
+
|
26
|
+
return [] unless names.any? || areas.any?
|
27
|
+
|
28
|
+
attrs = {
|
29
|
+
names: names,
|
30
|
+
catch_areas: nil,
|
31
|
+
catch_methods: nil,
|
32
|
+
aquaculture_areas: nil,
|
33
|
+
aquaculture_methods: nil
|
34
|
+
}
|
35
|
+
|
36
|
+
if is_wild && !is_aquac
|
37
|
+
[attrs.merge(catch_areas: areas, catch_methods: methods)]
|
38
|
+
elsif !is_wild && is_aquac
|
39
|
+
[attrs.merge(aquaculture_areas: areas, aquaculture_methods: methods)]
|
40
|
+
elsif areas.any? || methods.any?
|
41
|
+
# We have a problem: either there are multiple fish and they're a mix of
|
42
|
+
# wild and aquaculture fish, or there is no such indication at all.
|
43
|
+
# For now, we return it in a non-standard way (this needs to be tackled).
|
44
|
+
[attrs.merge(areas: areas, methods: methods)]
|
45
|
+
else
|
46
|
+
# just names
|
47
|
+
[attrs]
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
module FoodFishParser::Grammar
|
1
|
+
module FoodFishParser::Strict::Grammar
|
2
2
|
grammar FaoArea
|
3
3
|
include Common
|
4
4
|
|
@@ -45,8 +45,9 @@ module FoodFishParser::Grammar
|
|
45
45
|
|
46
46
|
rule fao_area_sub_code
|
47
47
|
(
|
48
|
-
|
49
|
-
( dash
|
48
|
+
# IVX for roman numerals, but l for incorrectly OCR-ed text
|
49
|
+
( ( dash / '/' / ws* )? [ivxIVXl]+ ) /
|
50
|
+
( dash [0-9]+ )
|
50
51
|
)
|
51
52
|
fao_area_suffix?
|
52
53
|
(
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module FoodFishParser::Strict::Grammar
|
2
|
+
grammar FishName
|
3
|
+
include Common
|
4
|
+
include FishNameLatin
|
5
|
+
include FishNameNL
|
6
|
+
|
7
|
+
rule fish_name_both
|
8
|
+
( fish_name_nl ws* '(' ws* fish_name_latin ( ws* ')' / comma )? )
|
9
|
+
end
|
10
|
+
|
11
|
+
rule fish_name_both_list
|
12
|
+
( fish_name_both <FishNameNode> )
|
13
|
+
( ws+ and_or ws+ fish_name_both <FishNameNode> )*
|
14
|
+
end
|
15
|
+
|
16
|
+
rule fish_name_latin_list
|
17
|
+
( fish_name_latin <FishNameNode> )
|
18
|
+
( ws+ and_or ws+ fish_name_latin <FishNameNode> )*
|
19
|
+
end
|
20
|
+
|
21
|
+
rule fish_name_nl_list
|
22
|
+
( fish_name_nl <FishNameNode> )
|
23
|
+
( ws+ and_or ws+ fish_name_nl <FishNameNode> )*
|
24
|
+
end
|
25
|
+
|
26
|
+
rule fish_name_any_list
|
27
|
+
fish_name_both_list / fish_name_latin_list / fish_name_nl_list
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
@@ -1,5 +1,5 @@
|
|
1
|
-
# autogenerated by species-treetop-gen-latin.rb on 2020-03-
|
2
|
-
module FoodFishParser::Grammar
|
1
|
+
# autogenerated by species-treetop-gen-latin.rb on 2020-03-19
|
2
|
+
module FoodFishParser::Strict::Grammar
|
3
3
|
grammar FishNameLatin
|
4
4
|
include Common
|
5
5
|
|
@@ -9,11 +9,11 @@ module FoodFishParser::Grammar
|
|
9
9
|
end
|
10
10
|
|
11
11
|
rule fish_name_latin_first
|
12
|
-
'zygochlamys'i / 'zeus'i / 'xiphopenaeus'i / 'xiphias'i / 'undaria'i / 'ulva'i / 'trichiurus'i / 'trachurus'i / 'todarodes'i / 'thunnus'i / 'theragra'i / 'stolephorus'i / 'sprattus'i / 'spirulina'i / 'sparus'i / 'solea'i / 'sepiella'i / 'sepia'i / 'sebastes'i / 'scomber'i / 'sardinella'i / 'sardina'i / 'salmo'i / 'saccharina'i / 'reinhardtius'i / 'psetta'i / 'procambarus'i / 'portunus'i / 'porphyra'i / 'pollachius'i / 'pleuronectes'i / 'pleoticus'i / 'placopecten'i / 'phymatolithon'i / 'perna'i / 'penaeus'i / 'penaeidae'i / 'pelvetia'i / 'pecten'i / 'patinopecten'i / 'parapenaeopsis'i / 'paralomis'i / 'paphia'i / 'pangasius'i / 'pandalus'i / 'palmaria'i / 'pagellus'i / '
|
12
|
+
'zygochlamys'i / 'zeus'i / 'xiphopenaeus'i / 'xiphias'i / 'undaria'i / 'ulva'i / 'trichiurus'i / 'trachurus'i / 'todarodes'i / 'thunnus'i / 'theragra'i / 'stolephorus'i / 'sprattus'i / 'spirulina'i / 'sparus'i / 'solea'i / 'sepiella'i / 'sepia'i / 'sebastes'i / 'scomber'i / 'sardinella'i / 'sardina'i / 'salmo'i / 'saccharina'i / 'reinhardtius'i / 'psetta'i / 'procambarus'i / 'portunus'i / 'porphyra'i / 'pollachius'i / 'pleuronectes'i / 'pleoticus'i / 'placopecten'i / 'phymatolithon'i / 'perna'i / 'penaeus'i / 'penaeidae'i / 'pelvetia'i / 'pecten'i / 'patinopecten'i / 'parapenaeopsis'i / 'paralomis'i / 'paphia'i / 'pangasius'i / 'pandalus'i / 'palmaria'i / 'pagellus'i / 'ovalipes'i / 'ostrea'i / 'oreochromis'i / 'oncorhynchus'i / 'octopus'i / 'nephrops'i / 'nemipterus'i / 'nelumbo'i / 'mytilus'i / 'mulinia'i / 'micromesistius'i / 'metapenaeus'i / 'merluccius'i / 'merlangius'i / 'melanogrammus'i / 'macruronus'i / 'macrocystis'i / 'lophius'i / 'loligo'i / 'litopenaeus'i / 'lithodes'i / 'limanda'i / 'lethrinus'i / 'lepidotrigla'i / 'lepidopsetta'i / 'lates'i / 'laminaria'i / 'katwonus'i / 'katwomus'i / 'katsuwonus'i / 'katsuwomus'i / 'illex'i / 'homarus'i / 'himanthalia'i / 'haematococcus'i / 'gracilaria'i / 'gelidium'i / 'gadus'i / 'fucus'i / 'euthynnus'i / 'ensis'i / 'engraulis'i / 'dunaliella'i / 'dosidicus'i / 'dicentrarchus'i / 'crassostrea'i / 'crangon'i / 'clupea'i / 'clarias'i / 'chondrus'i / 'chlorella'i / 'cerastoderma'i / 'caulerpa'i / 'ascophyllum'i / 'anguilla'i / 'anadara'i / 'alle'i / 'alaria'i / 'acipenser'i
|
13
13
|
end
|
14
14
|
|
15
15
|
rule fish_name_latin_second
|
16
|
-
'yessoensis'i / 'vulgaris'i / 'virens'i / 'vesiculosus'i / 'verrucosa'i / 'vannamei'i / 'undulata'i / 'umbilicalis'i / 'tenera'i / 'stylifera'i / 'sprattus'i / 'spp.'i / 'spp'i / 'sombrus'i / 'solea'i / 'solar'i / 'scombrus'i / 'santolla'i / 'salina'i / 'salar'i / 'ringens'i / 'pyrifera'i / 'pyrenoidosa'i / 'punctatus'i / 'productus'i / 'pluvialis'i / 'platessa'i / 'platensis'i / 'piscatorius'i / 'pinnatifida'i / 'pilchardus'i / 'pelatis'i / 'pelanis'i / 'pelamis'i / 'pelagicus'i / 'patagonica'i / 'pangasius'i / 'palmata'i / 'pacificus'i / 'officinalis'i / 'ocellatus'i / 'nucifera'i / 'novaezelandiae'i / 'novaezealandiae'i / 'norvegicus'i / 'nodosum'i / 'niloticus'i / 'nerka'i / 'mykiss'i / 'mykis'i / 'myki?s'i / 'murphyi'i / 'muelleri'i / 'morhue'i / 'morhua'i / 'monodon'i / 'monoceros'i / 'microptera'i / 'merluccius'i / 'merlangus'i / 'merguiensis'i / 'maximus'i / 'maxima'i / 'marinus'i / 'magellanicus'i / 'macrocephalus'i / 'limanda'i / 'lepturus'i / 'lentillifera'i / 'latissima'i / 'lactuca'i / 'labrax'i / 'kroyeri'i / 'kisutch'i / 'keta'i / '
|
16
|
+
'yessoensis'i / 'vulgaris'i / 'virens'i / 'vesiculosus'i / 'verrucosa'i / 'vannamei'i / 'undulata'i / 'umbilicalis'i / 'tenera'i / 'stylifera'i / 'sprattus'i / 'spp.'i / 'spp'i / 'sombrus'i / 'solea'i / 'solar'i / 'scombrus'i / 'santolla'i / 'salina'i / 'salar'i / 'ringens'i / 'pyrifera'i / 'pyrenoidosa'i / 'punctatus'i / 'productus'i / 'pluvialis'i / 'platessa'i / 'platensis'i / 'piscatorius'i / 'pinnatifida'i / 'pilchardus'i / 'pelatis'i / 'pelanis'i / 'pelamis'i / 'pelagicus'i / 'patagonica'i / 'pangasius'i / 'palmata'i / 'pacificus'i / 'officinalis'i / 'ocellatus'i / 'nucifera'i / 'novaezelandiae'i / 'novaezealandiae'i / 'norvegicus'i / 'nodosum'i / 'niloticus'i / 'nerka'i / 'mykiss'i / 'mykis'i / 'myki?s'i / 'murphyi'i / 'muelleri'i / 'morhue'i / 'morhua'i / 'monodon'i / 'monoceros'i / 'microptera'i / 'merluccius'i / 'merlangus'i / 'merguiensis'i / 'maximus'i / 'maxima'i / 'marinus'i / 'magellanicus'i / 'macrocephalus'i / 'limanda'i / 'lepturus'i / 'lentillifera'i / 'latissima'i / 'lactuca'i / 'labrax'i / 'kroyeri'i / 'kisutch'i / 'keta'i / 'jordani'i / 'japonicus'i / 'japonica'i / 'hyptophthalmus'i / 'hypophtalmus'i / 'hippoglossoides'i / 'hexodon'i / 'harengus'i / 'gueldenstaedtii'i / 'granulosa'i / 'gorbusche'i / 'gorbuscha'i / 'gladius'i / 'gigas'i / 'gibbosa'i / 'gariepinus'i / 'galloprovincialis'i / 'faber'i / 'esculenta'i / 'encrasicolus'i / 'elongata'i / 'edulis'i / 'edule'i / 'directus'i / 'digitata'i / 'crispus'i / 'crangon'i / 'clarkii'i / 'chilensis'i / 'chalcogramma'i / 'capensis'i / 'cannamei'i / 'canaliculus'i / 'canaliculata'i / 'calcareum'i / 'borealis'i / 'bogaraveo'i / 'bilineata'i / 'australis'i / 'aurata'i / 'argentinus'i / 'antiquata'i / 'anquilla'i / 'anguilla'i / 'anchoita'i / 'americanus'i / 'alle'i / 'albacares'i / 'alalunga'i / 'aeglefinus'i
|
17
17
|
end
|
18
18
|
end
|
19
19
|
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
module FoodFishParser::Grammar
|
1
|
+
module FoodFishParser::Strict::Grammar
|
2
2
|
grammar Root
|
3
3
|
include Common
|
4
4
|
include FishName
|
@@ -18,8 +18,10 @@ module FoodFishParser::Grammar
|
|
18
18
|
rule root
|
19
19
|
(
|
20
20
|
( fish_with_info ( ws* fish_sep ws* fish_with_info )* ) /
|
21
|
-
(
|
22
|
-
(
|
21
|
+
( fish_names_both ( ws* fish_sep ws* fish_names_both )* ) /
|
22
|
+
( fish_names_latin ( ws* fish_sep ws* fish_names_latin )* ) /
|
23
|
+
( fish_only_info ( ws* fish_sep ws* fish_only_info )* ) /
|
24
|
+
( fish_names_nl ( ws* fish_sep ws* fish_names_nl )* )
|
23
25
|
)
|
24
26
|
ws* '.'? ws*
|
25
27
|
<RootNode>
|
@@ -31,8 +33,10 @@ module FoodFishParser::Grammar
|
|
31
33
|
rule root_anywhere
|
32
34
|
(
|
33
35
|
( !fish_with_info . )* ( fish_with_info ( ( !fish_with_info . )* fish_with_info )* ) ( !fish_with_info . )* /
|
34
|
-
( !
|
35
|
-
( !
|
36
|
+
( !fish_names_both . )* ( fish_names_both ( ( !fish_names_both . )+ fish_names_both )* ) ( !fish_names_both . )* /
|
37
|
+
( !fish_names_latin . )* ( fish_names_latin ( ( !fish_names_latin . )+ fish_names_latin )* ) ( !fish_names_latin . )* /
|
38
|
+
( !fish_only_info . )* ( fish_only_info ( ( !fish_only_info . )+ fish_only_info )* ) ( !fish_only_info . )* /
|
39
|
+
( !fish_names_nl . )* ( fish_names_nl ( ( !fish_names_nl . )+ fish_names_nl )* ) ( !fish_names_nl . )*
|
36
40
|
)
|
37
41
|
<RootNode>
|
38
42
|
end
|
@@ -45,15 +49,25 @@ module FoodFishParser::Grammar
|
|
45
49
|
# fish with catch or aquaculture info
|
46
50
|
rule fish_with_info
|
47
51
|
(
|
48
|
-
(
|
49
|
-
(
|
52
|
+
( fish_name_any_list ( ws* ( comma / ':' ) )? ws+ fish_catch_info ) /
|
53
|
+
( fish_name_any_list ( ws* ( comma / ':' ) )? ws+ fish_aquac_info )
|
50
54
|
)
|
51
55
|
<FishNode>
|
52
56
|
end
|
53
57
|
|
54
|
-
# fish names
|
55
|
-
rule
|
56
|
-
|
58
|
+
# fish names common and latin
|
59
|
+
rule fish_names_both
|
60
|
+
fish_name_both_list <FishNode>
|
61
|
+
end
|
62
|
+
|
63
|
+
# fish names Latin only
|
64
|
+
rule fish_names_latin
|
65
|
+
fish_name_latin_list <FishNode>
|
66
|
+
end
|
67
|
+
|
68
|
+
# fish names NL only
|
69
|
+
rule fish_names_nl
|
70
|
+
fish_name_nl_list <FishNode>
|
57
71
|
end
|
58
72
|
|
59
73
|
# catch or aquaculture info only (no names)
|
File without changes
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require_relative 'grammar'
|
2
|
+
|
3
|
+
module FoodFishParser
|
4
|
+
module Strict
|
5
|
+
class Parser
|
6
|
+
|
7
|
+
# @!attribute [r] parser
|
8
|
+
# @return [Treetop::Runtime::CompiledParser] low-level parser object
|
9
|
+
# @note This attribute is there for convenience, but may change in the future. Take care.
|
10
|
+
attr_reader :parser
|
11
|
+
|
12
|
+
# Create a new fish detail parser
|
13
|
+
# @return [FoodFishParser::Strict::Parser]
|
14
|
+
def initialize
|
15
|
+
@parser = Grammar::RootParser.new
|
16
|
+
end
|
17
|
+
|
18
|
+
# Parse food fish text into a structured representation.
|
19
|
+
#
|
20
|
+
# @param s [String] text to parse
|
21
|
+
# @param anywhere [Bool] +false+ assume the text is only fish details, +true+ to search for fish details in the text
|
22
|
+
# @return [FoodFishParser::Strict::Grammar::RootNode] structured representation of fish details
|
23
|
+
# @note Unrecognized options are passed to Treetop, but this is not guaranteed to remain so forever.
|
24
|
+
def parse(s, anywhere: false, **options)
|
25
|
+
if anywhere
|
26
|
+
options = options.merge(root: :root_anywhere, consume_all_input: false)
|
27
|
+
end
|
28
|
+
|
29
|
+
@parser.parse(s, **options)
|
30
|
+
end
|
31
|
+
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
data/lib/food_fish_parser.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: food_fish_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- wvengen
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-03-
|
11
|
+
date: 2020-03-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: treetop
|
@@ -43,20 +43,26 @@ files:
|
|
43
43
|
- bin/food_fish_parser
|
44
44
|
- food_fish_parser.gemspec
|
45
45
|
- lib/food_fish_parser.rb
|
46
|
-
- lib/food_fish_parser/
|
47
|
-
- lib/food_fish_parser/
|
48
|
-
- lib/food_fish_parser/
|
49
|
-
- lib/food_fish_parser/
|
50
|
-
- lib/food_fish_parser/
|
51
|
-
- lib/food_fish_parser/
|
52
|
-
- lib/food_fish_parser/grammar
|
53
|
-
- lib/food_fish_parser/grammar/
|
54
|
-
- lib/food_fish_parser/grammar/
|
55
|
-
- lib/food_fish_parser/grammar/
|
56
|
-
- lib/food_fish_parser/grammar/
|
57
|
-
- lib/food_fish_parser/grammar/
|
58
|
-
- lib/food_fish_parser/
|
59
|
-
- lib/food_fish_parser/
|
46
|
+
- lib/food_fish_parser/flat/fao_region.rb
|
47
|
+
- lib/food_fish_parser/flat/fish_name.rb
|
48
|
+
- lib/food_fish_parser/flat/fish_name_latin.rb
|
49
|
+
- lib/food_fish_parser/flat/fish_name_nl.rb
|
50
|
+
- lib/food_fish_parser/flat/kind.rb
|
51
|
+
- lib/food_fish_parser/flat/parser.rb
|
52
|
+
- lib/food_fish_parser/strict/grammar.rb
|
53
|
+
- lib/food_fish_parser/strict/grammar/aquac_area.treetop
|
54
|
+
- lib/food_fish_parser/strict/grammar/aquac_method.treetop
|
55
|
+
- lib/food_fish_parser/strict/grammar/catch_area.treetop
|
56
|
+
- lib/food_fish_parser/strict/grammar/catch_method.treetop
|
57
|
+
- lib/food_fish_parser/strict/grammar/common.treetop
|
58
|
+
- lib/food_fish_parser/strict/grammar/fao_area.treetop
|
59
|
+
- lib/food_fish_parser/strict/grammar/fish_name.treetop
|
60
|
+
- lib/food_fish_parser/strict/grammar/fish_name_latin.treetop
|
61
|
+
- lib/food_fish_parser/strict/grammar/fish_name_nl.treetop
|
62
|
+
- lib/food_fish_parser/strict/grammar/root.treetop
|
63
|
+
- lib/food_fish_parser/strict/grammar/words.treetop
|
64
|
+
- lib/food_fish_parser/strict/nodes.rb
|
65
|
+
- lib/food_fish_parser/strict/parser.rb
|
60
66
|
- lib/food_fish_parser/version.rb
|
61
67
|
homepage: https://github.com/q-m/food-fish-parser-ruby
|
62
68
|
licenses:
|
@@ -1,21 +0,0 @@
|
|
1
|
-
module FoodFishParser::Grammar
|
2
|
-
grammar FishName
|
3
|
-
include Common
|
4
|
-
include FishNameLatin
|
5
|
-
include FishNameNL
|
6
|
-
|
7
|
-
rule fish_name
|
8
|
-
(
|
9
|
-
fish_name_nl ws* '(' ws* fish_name_latin ws* ')' /
|
10
|
-
fish_name_nl /
|
11
|
-
fish_name_latin
|
12
|
-
)
|
13
|
-
<FishNameNode>
|
14
|
-
end
|
15
|
-
|
16
|
-
rule fish_name_list
|
17
|
-
fish_name ( ws+ and_or ws+ fish_name )*
|
18
|
-
end
|
19
|
-
|
20
|
-
end
|
21
|
-
end
|
@@ -1,32 +0,0 @@
|
|
1
|
-
require_relative 'grammar'
|
2
|
-
|
3
|
-
module FoodFishParser
|
4
|
-
class Parser
|
5
|
-
|
6
|
-
# @!attribute [r] parser
|
7
|
-
# @return [Treetop::Runtime::CompiledParser] low-level parser object
|
8
|
-
# @note This attribute is there for convenience, but may change in the future. Take care.
|
9
|
-
attr_reader :parser
|
10
|
-
|
11
|
-
# Create a new fish detail parser
|
12
|
-
# @return [FoodFishParser::Parser]
|
13
|
-
def initialize
|
14
|
-
@parser = Grammar::RootParser.new
|
15
|
-
end
|
16
|
-
|
17
|
-
# Parse food fish text into a structured representation.
|
18
|
-
#
|
19
|
-
# @param s [String] text to parse
|
20
|
-
# @param anywhere [Bool] +false+ assume the text is only fish details, +true+ to search for fish details in the text
|
21
|
-
# @return [FoodFishParser::Grammar::RootNode] structured representation of fish details
|
22
|
-
# @note Unrecognized options are passed to Treetop, but this is not guaranteed to remain so forever.
|
23
|
-
def parse(s, anywhere: false, **options)
|
24
|
-
if anywhere
|
25
|
-
options = options.merge(root: :root_anywhere, consume_all_input: false)
|
26
|
-
end
|
27
|
-
|
28
|
-
@parser.parse(s, **options)
|
29
|
-
end
|
30
|
-
|
31
|
-
end
|
32
|
-
end
|