relaton-ogc 1.9.1 → 1.9.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.adoc +19 -0
- data/lib/relaton_ogc/data_fetcher.rb +96 -0
- data/lib/relaton_ogc/hit_collection.rb +14 -37
- data/lib/relaton_ogc/processor.rb +14 -1
- data/lib/relaton_ogc/scrapper.rb +4 -2
- data/lib/relaton_ogc/version.rb +1 -1
- data/lib/relaton_ogc.rb +1 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d9ddd36656e221ad7ceb82adf637197b15648451b42fbd78069bc02078082c05
|
4
|
+
data.tar.gz: 9179d1d896e37f26841c392701c500990eafbfdb1060a4120bb85189014d788e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0ab0f2b5ed50fd8f218c92ae1a8589df818f92bb2480fffc450fbe930c16185cc9855d683dc4aafb5d34ad4f808ffd1cc44ba11b19adc6550d62dc029e7c0b11
|
7
|
+
data.tar.gz: c62733901e8f8790eaaccf90345503fd187bba8e8a08655400fdd77aa508df640ec7a4f11ef1146e41b0e456c538954769cbb4ef2d6eacba3b6b2cac73a70e01
|
data/README.adoc
CHANGED
@@ -113,6 +113,25 @@ RelatonOgc::OgcBibliographicItem.from_hash hash
|
|
113
113
|
...
|
114
114
|
----
|
115
115
|
|
116
|
+
=== Fetch data
|
117
|
+
|
118
|
+
This gem uses the https://raw.githubusercontent.com/opengeospatial/NamingAuthority/master/incubation/bibliography/bibliography.json dataset as a data source.
|
119
|
+
|
120
|
+
The method `RelatonOgc::DataFetcher.fetch(output: "data", format: "yaml")` fetches all the documents from the dataset and saves them to the `./data` folder in YAML format.
|
121
|
+
Arguments:
|
122
|
+
|
123
|
+
- `output` - folder to save documents (default './data').
|
124
|
+
- `format` - format in which the documents are saved. Possible formats are: `yaml`, `xml` (default `yaml`).
|
125
|
+
|
126
|
+
[source,ruby]
|
127
|
+
----
|
128
|
+
RelatonOgc::DataFetcher.fetch
|
129
|
+
Started at: 2021-09-14 11:21:46 +0200
|
130
|
+
[relaton-ogc] WARNING Duplicated documents: 15-113r5, 08-094r1, 10-025r1, 12-128r14, 16-079, 16-007r3, 13-026r8, 12-128r12, 15-078r6, 12-176r7, 09-102r3, 14-095, 14-115, 07-147r2, 12-000, 12-006, 09-025r1, 07-036, 07-110r4, 03-105r1, 06-042, 07-165r1, 12-066, 06-104r4, 11-122r1, 09-000, 04-094, 07-006r1, 06-035r1, 03-006r3, 05-134, 04-021r3, 02-058, 01-009
|
131
|
+
Stopped at: 2021-09-14 11:21:48 +0200
|
132
|
+
=> nil
|
133
|
+
----
|
134
|
+
|
116
135
|
== Development
|
117
136
|
|
118
137
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
@@ -0,0 +1,96 @@
|
|
1
|
+
module RelatonOgc
|
2
|
+
class DataFetcher
|
3
|
+
module Utils
|
4
|
+
ENDPOINT = "https://raw.githubusercontent.com/opengeospatial/"\
|
5
|
+
"NamingAuthority/master/incubation/bibliography/"\
|
6
|
+
"bibliography.json".freeze
|
7
|
+
|
8
|
+
def get_data # rubocop:disable Metrics/AbcSize
|
9
|
+
h = {}
|
10
|
+
h["If-None-Match"] = etag if etag
|
11
|
+
resp = Faraday.new(ENDPOINT, headers: h).get
|
12
|
+
case resp.status
|
13
|
+
when 200
|
14
|
+
json = JSON.parse(resp.body)
|
15
|
+
block_given? ? yield(resp[:etag], json) : json
|
16
|
+
when 304 then [] # there aren't any changes since last fetching
|
17
|
+
else raise RelatonBib::RequestError, "Could not access #{ENDPOINT}"
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
#
|
22
|
+
# Read ETag from file
|
23
|
+
#
|
24
|
+
# @return [String, NilClass]
|
25
|
+
def etag
|
26
|
+
@etag ||= if File.exist? @etagfile
|
27
|
+
File.read @etagfile, encoding: "UTF-8"
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
#
|
32
|
+
# Save ETag to file
|
33
|
+
#
|
34
|
+
# @param e_tag [String]
|
35
|
+
def etag=(e_tag)
|
36
|
+
File.write @etagfile, e_tag, encoding: "UTF-8"
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
include Utils
|
41
|
+
|
42
|
+
#
|
43
|
+
# Create DataFetcher instance
|
44
|
+
#
|
45
|
+
# @param [String] output directory to save the documents
|
46
|
+
# @param [String] format output format "yaml" or "xml"
|
47
|
+
#
|
48
|
+
def initialize(output, format)
|
49
|
+
@output = output
|
50
|
+
@etagfile = File.join output, "etag.txt"
|
51
|
+
@format = format
|
52
|
+
@docids = []
|
53
|
+
@dupids = []
|
54
|
+
end
|
55
|
+
|
56
|
+
def self.fetch(output: "data", format: "yaml")
|
57
|
+
t1 = Time.now
|
58
|
+
puts "Started at: #{t1}"
|
59
|
+
FileUtils.mkdir_p output unless Dir.exist? output
|
60
|
+
new(output, format).fetch
|
61
|
+
t2 = Time.now
|
62
|
+
puts "Stopped at: #{t2}"
|
63
|
+
puts "Done in: #{(t2 - t1).round} sec."
|
64
|
+
end
|
65
|
+
|
66
|
+
def fetch # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
|
67
|
+
get_data do |etag, json|
|
68
|
+
no_errors = true
|
69
|
+
json.each do |_, hit|
|
70
|
+
bib = Scrapper.parse_page hit
|
71
|
+
write_document bib
|
72
|
+
rescue StandardError => e
|
73
|
+
no_errors = false
|
74
|
+
warn "Fetching document: #{hit['identifier']}"
|
75
|
+
warn "#{e.class} #{e.message}"
|
76
|
+
warn e.backtrace
|
77
|
+
end
|
78
|
+
warn "[relaton-ogc] WARNING Duplicated documents: #{@dupids.uniq.join(', ')}" if @dupids.any?
|
79
|
+
self.etag = etag if no_errors
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
def write_document(bib) # rubocop:disable Metrics/AbcSize
|
84
|
+
if @docids.include?(bib.docidentifier[0].id)
|
85
|
+
@dupids << bib.docidentifier[0].id
|
86
|
+
return
|
87
|
+
end
|
88
|
+
|
89
|
+
@docids << bib.docidentifier[0].id
|
90
|
+
name = bib.docidentifier[0].id.upcase.gsub(/[\s:.]/, "_")
|
91
|
+
file = "#{@output}/#{name}.#{@format}"
|
92
|
+
content = @format == "xml" ? bib.to_xml(bibdata: true) : bib.to_hash.to_yaml
|
93
|
+
File.write file, content, encoding: "UTF-8"
|
94
|
+
end
|
95
|
+
end
|
96
|
+
end
|
@@ -4,23 +4,25 @@ require "fileutils"
|
|
4
4
|
|
5
5
|
module RelatonOgc
|
6
6
|
class HitCollection < RelatonBib::HitCollection
|
7
|
-
|
8
|
-
|
7
|
+
include DataFetcher::Utils
|
8
|
+
|
9
|
+
# ENDPOINT = "https://raw.githubusercontent.com/opengeospatial/"\
|
10
|
+
# "NamingAuthority/master/incubation/bibliography/"\
|
11
|
+
# "bibliography.json".freeze
|
9
12
|
DATADIR = File.expand_path ".relaton/ogc/", Dir.home
|
10
13
|
DATAFILE = File.expand_path "bibliography.json", DATADIR
|
11
|
-
ETAGFILE = File.expand_path "etag.txt", DATADIR
|
14
|
+
# ETAGFILE = File.expand_path "etag.txt", DATADIR
|
12
15
|
|
13
16
|
# @param ref [String]
|
14
17
|
# @param year [String]
|
15
18
|
# @param opts [Hash]
|
16
19
|
def initialize(ref, year = nil)
|
17
20
|
super
|
21
|
+
@etagfile = File.expand_path "etag.txt", DATADIR
|
18
22
|
@array = from_json(ref).sort_by do |hit|
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
Date.parse "0000-01-01"
|
23
|
-
end
|
23
|
+
hit.hit["date"] ? Date.parse(hit.hit["date"]) : Date.new
|
24
|
+
rescue ArgumentError
|
25
|
+
Date.parse "0000-01-01"
|
24
26
|
end.reverse
|
25
27
|
end
|
26
28
|
|
@@ -52,38 +54,13 @@ module RelatonOgc
|
|
52
54
|
#
|
53
55
|
# fetch data from server and save it to file.
|
54
56
|
#
|
55
|
-
def fetch_data
|
56
|
-
|
57
|
-
|
58
|
-
resp = Faraday.new(ENDPOINT, headers: h).get
|
59
|
-
# return if there aren't any changes since last fetching
|
60
|
-
return if resp.status == 304
|
61
|
-
unless resp.status == 200
|
62
|
-
raise RelatonBib::RequestError, "Could not access #{ENDPOINT}"
|
63
|
-
end
|
57
|
+
def fetch_data
|
58
|
+
json = get_data
|
59
|
+
return unless json
|
64
60
|
|
65
61
|
FileUtils.mkdir_p DATADIR unless Dir.exist? DATADIR
|
66
|
-
|
67
|
-
@data = JSON.parse resp.body
|
62
|
+
@data = json
|
68
63
|
File.write DATAFILE, @data.to_json, encoding: "UTF-8"
|
69
64
|
end
|
70
|
-
|
71
|
-
#
|
72
|
-
# Read ETag form file
|
73
|
-
#
|
74
|
-
# @return [String, NilClass]
|
75
|
-
def etag
|
76
|
-
@etag ||= if File.exist? ETAGFILE
|
77
|
-
File.read ETAGFILE, encoding: "UTF-8"
|
78
|
-
end
|
79
|
-
end
|
80
|
-
|
81
|
-
#
|
82
|
-
# Save ETag to file
|
83
|
-
#
|
84
|
-
# @param tag [String]
|
85
|
-
def etag=(e_tag)
|
86
|
-
File.write ETAGFILE, e_tag, encoding: "UTF-8"
|
87
|
-
end
|
88
65
|
end
|
89
66
|
end
|
@@ -2,11 +2,12 @@ require "relaton/processor"
|
|
2
2
|
|
3
3
|
module RelatonOgc
|
4
4
|
class Processor < Relaton::Processor
|
5
|
-
def initialize
|
5
|
+
def initialize # rubocop:disable Lint/MissingSuper
|
6
6
|
@short = :relaton_ogc
|
7
7
|
@prefix = "OGC"
|
8
8
|
@defaultprefix = %r{^OGC\s}
|
9
9
|
@idtype = "OGC"
|
10
|
+
@datasets = %w[ogc-naming-authority]
|
10
11
|
end
|
11
12
|
|
12
13
|
# @param code [String]
|
@@ -17,6 +18,18 @@ module RelatonOgc
|
|
17
18
|
::RelatonOgc::OgcBibliography.get(code, date, opts)
|
18
19
|
end
|
19
20
|
|
21
|
+
#
|
22
|
+
# Fetch all the documents from a source
|
23
|
+
#
|
24
|
+
# @param [String] _source source name
|
25
|
+
# @param [Hash] opts
|
26
|
+
# @option opts [String] :output directory to output documents
|
27
|
+
# @option opts [String] :format
|
28
|
+
#
|
29
|
+
def fetch_data(_source, opts)
|
30
|
+
DataFetcher.fetch(**opts)
|
31
|
+
end
|
32
|
+
|
20
33
|
# @param xml [String]
|
21
34
|
# @return [RelatonOgc::OgcBibliographicItem]
|
22
35
|
def from_xml(xml)
|
data/lib/relaton_ogc/scrapper.rb
CHANGED
@@ -13,7 +13,7 @@ module RelatonOgc
|
|
13
13
|
"IPR" => { type: "engineering-report" },
|
14
14
|
"IS" => { type: "standard", subtype: "implementation" },
|
15
15
|
"ISC" => { type: "standard", subtype: "implementation" },
|
16
|
-
"ISx" => { type: "standard", subtype: "
|
16
|
+
"ISx" => { type: "standard", subtype: "extension" },
|
17
17
|
"Notes" => { type: "other" },
|
18
18
|
"ORM" => { type: "reference-model" },
|
19
19
|
"PC" => { type: "standard", subtype: "profile" },
|
@@ -88,7 +88,7 @@ module RelatonOgc
|
|
88
88
|
# @param stage [String]
|
89
89
|
# @return [RelatonBib::DocumentStatus, NilClass]
|
90
90
|
def fetch_status(stage)
|
91
|
-
stage && RelatonBib::
|
91
|
+
stage && RelatonBib::DocumentStatus.new(stage: stage)
|
92
92
|
end
|
93
93
|
|
94
94
|
# @param identifier [String]
|
@@ -138,6 +138,8 @@ module RelatonOgc
|
|
138
138
|
# @param date [String]
|
139
139
|
# @return [Array<RelatonBib::BibliographicDate>]
|
140
140
|
def fetch_date(date)
|
141
|
+
return [] unless date
|
142
|
+
|
141
143
|
[RelatonBib::BibliographicDate.new(type: "published", on: date)]
|
142
144
|
end
|
143
145
|
end
|
data/lib/relaton_ogc/version.rb
CHANGED
data/lib/relaton_ogc.rb
CHANGED
@@ -2,6 +2,7 @@ require "relaton_iso_bib"
|
|
2
2
|
require "relaton_ogc/version"
|
3
3
|
require "relaton_ogc/ogc_bibliographic_item"
|
4
4
|
require "relaton_ogc/ogc_bibliography"
|
5
|
+
require "relaton_ogc/data_fetcher"
|
5
6
|
require "relaton_ogc/hit_collection"
|
6
7
|
require "relaton_ogc/scrapper"
|
7
8
|
require "relaton_ogc/xml_parser"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: relaton-ogc
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.9.
|
4
|
+
version: 1.9.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-09-
|
11
|
+
date: 2021-09-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: equivalent-xml
|
@@ -175,6 +175,7 @@ files:
|
|
175
175
|
- grammars/ogc.rng
|
176
176
|
- grammars/reqt.rng
|
177
177
|
- lib/relaton_ogc.rb
|
178
|
+
- lib/relaton_ogc/data_fetcher.rb
|
178
179
|
- lib/relaton_ogc/editorial_group.rb
|
179
180
|
- lib/relaton_ogc/hash_converter.rb
|
180
181
|
- lib/relaton_ogc/hit.rb
|