relaton-w3c 1.19.0 → 1.20.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.adoc +2 -5
- data/lib/relaton_w3c/data_fetcher.rb +45 -164
- data/lib/relaton_w3c/data_parser.rb +86 -211
- data/lib/relaton_w3c/processor.rb +3 -3
- data/lib/relaton_w3c/rate_limit_handler.rb +32 -0
- data/lib/relaton_w3c/version.rb +1 -1
- data/relaton_w3c.gemspec +2 -9
- metadata +12 -110
- data/lib/relaton_w3c/workgroups.yaml +0 -508
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: beb3bb1a8550059e754f001412336b78ab55771f25ebf33534dd92f59a724b9d
|
4
|
+
data.tar.gz: d32d55f604c16613c04d5d2ad5554e7c7ed8c6151ad0218411c47d58099b5a63
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b2927bebbb35dde93cb2659b44e0ef47608ffa164ae33d138041a46139ca17b1a62061eb1055959fe91771bf397bf5ecb11f9ff2ca19055d7d804a65962a8974
|
7
|
+
data.tar.gz: 240f1c9f84770eb129a372a015b582c7f3ff6870eaf83e2dd723ff9538bbc19da34d58190234441b3c6b9dde224810e09c5e287473afa1183f23c53deabcc849
|
data/README.adoc
CHANGED
@@ -108,16 +108,13 @@ RelatonW3c::W3cBibliographicItem.new **bib_hash
|
|
108
108
|
|
109
109
|
=== Fetch data
|
110
110
|
|
111
|
-
The method `RelatonW3c::DataFetcher.fetch(
|
111
|
+
The method `RelatonW3c::DataFetcher.fetch(output: "data", format: "yaml")` converts all the documents from the dataset and saves them to the `./data` folder in YAML format.
|
112
112
|
Arguments:
|
113
113
|
|
114
|
-
- `source` - the name of the dataset (`w3c-rdf` or `w3c-tr-archive`)
|
115
114
|
- `output` - folder to save documents (default './data').
|
116
115
|
- `format` - the format in which the documents are saved. Possible formats are: `yaml`, `xml`, `bibxml` (default `yaml`).
|
117
116
|
|
118
|
-
The
|
119
|
-
- `w3c-rdf` - The dataset is fetched from http://www.w3.org/2002/01/tr-automation/tr.rdf.
|
120
|
-
- `w3c-tr-archive` - The archive dataset files should be downloaded from https://github.com/relaton/w3c-tr-archive repository and placed into `w3c-tr-archive` folder.
|
117
|
+
The method fetches all the W3C documents using the W3C API, which is documented at https://api.w3.org/doc.
|
121
118
|
|
122
119
|
[source,ruby]
|
123
120
|
----
|
@@ -1,12 +1,10 @@
|
|
1
|
-
require "
|
2
|
-
|
3
|
-
|
4
|
-
require "mechanize"
|
5
|
-
require "relaton_w3c/data_parser"
|
1
|
+
require "w3c_api"
|
2
|
+
require_relative "rate_limit_handler"
|
3
|
+
require_relative "data_parser"
|
6
4
|
|
7
5
|
module RelatonW3c
|
8
6
|
class DataFetcher
|
9
|
-
|
7
|
+
include RelatonW3c::RateLimitHandler
|
10
8
|
|
11
9
|
#
|
12
10
|
# Data fetcher initializer
|
@@ -18,8 +16,8 @@ module RelatonW3c
|
|
18
16
|
@output = output
|
19
17
|
@format = format
|
20
18
|
@ext = format.sub(/^bib/, "")
|
21
|
-
|
22
|
-
@
|
19
|
+
@files = Set.new
|
20
|
+
@fetched_urls = {}
|
23
21
|
@index = DataIndex.create_from_file
|
24
22
|
@index1 = Relaton::Index.find_or_create :W3C, file: "index1.yaml"
|
25
23
|
end
|
@@ -27,178 +25,59 @@ module RelatonW3c
|
|
27
25
|
#
|
28
26
|
# Initialize fetcher and run fetch
|
29
27
|
#
|
30
|
-
# @param [String] source source name "w3c-tr-archive" or "w3c-rdf"
|
31
28
|
# @param [String] output directory to save files, default: "data"
|
32
29
|
# @param [String] format format of output files (xml, yaml, bibxml), default: yaml
|
33
30
|
#
|
34
|
-
def self.fetch(
|
31
|
+
def self.fetch(output: "data", format: "yaml")
|
35
32
|
t1 = Time.now
|
36
33
|
puts "Started at: #{t1}"
|
37
34
|
FileUtils.mkdir_p output
|
38
|
-
new(output, format).fetch
|
35
|
+
new(output, format).fetch
|
39
36
|
t2 = Time.now
|
40
37
|
puts "Stopped at: #{t2}"
|
41
38
|
puts "Done in: #{(t2 - t1).round} sec."
|
42
39
|
end
|
43
40
|
|
44
|
-
|
45
|
-
|
46
|
-
#
|
47
|
-
# @param [String] source source name "w3c-tr-archive" or "w3c-rdf"
|
48
|
-
#
|
49
|
-
def fetch(source) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
50
|
-
each_dataset(source) do |rdf|
|
51
|
-
%i[versioned unversioned].each do |type|
|
52
|
-
send("query_#{type}_docs", rdf).each do |sl|
|
53
|
-
bib = DataParser.parse(rdf, sl, self)
|
54
|
-
add_has_edition_relation(bib) if type == :unversioned
|
55
|
-
save_doc bib
|
56
|
-
rescue StandardError => e
|
57
|
-
link = sl.respond_to?(:link) ? sl.link : sl.version_of
|
58
|
-
Util.error "Error: document #{link} #{e.message}\n#{e.backtrace.join("\n")}"
|
59
|
-
end
|
60
|
-
end
|
61
|
-
end
|
62
|
-
@index.sort!.save
|
63
|
-
@index1.save
|
41
|
+
def client
|
42
|
+
@client ||= W3cApi::Client.new
|
64
43
|
end
|
65
44
|
|
66
45
|
#
|
67
|
-
#
|
68
|
-
#
|
69
|
-
# @param [RelatonW3c::W3cBibliographicItem] bib bibliographic item
|
46
|
+
# Parse documents
|
70
47
|
#
|
71
|
-
def
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
r1.type = "hasEdition" if r1.type == "instanceOf"
|
77
|
-
same_edition = bib.relation.detect { |r2| same_edition?(r1, r2) }
|
78
|
-
bib.relation << r1 unless same_edition
|
48
|
+
def fetch
|
49
|
+
specs = client.specifications
|
50
|
+
loop do
|
51
|
+
specs.links.specifications.each do |spec|
|
52
|
+
fetch_spec spec
|
79
53
|
end
|
80
|
-
end
|
81
|
-
bib.relation.select { |r| r.type == "hasEdition" }
|
82
|
-
.max_by { |r| r.bibitem.id.match(/(?<=-)\d{8}$/).to_s }&.type = "instanceOf"
|
83
|
-
end
|
84
54
|
|
85
|
-
|
86
|
-
# Read XML file
|
87
|
-
#
|
88
|
-
# @param [String] file file name
|
89
|
-
#
|
90
|
-
# @return [RelatonW3c::W3cBibliographicItem] bibliographic item
|
91
|
-
#
|
92
|
-
def read_xml(file)
|
93
|
-
XMLParser.from_xml(File.read(file, encoding: "UTF-8"))
|
94
|
-
end
|
95
|
-
|
96
|
-
#
|
97
|
-
# Read YAML file
|
98
|
-
#
|
99
|
-
# @param [String] file file name
|
100
|
-
#
|
101
|
-
# @return [RelatonW3c::W3cBibliographicItem] bibliographic item
|
102
|
-
#
|
103
|
-
def read_yaml(file)
|
104
|
-
hash = YAML.load_file(file)
|
105
|
-
W3cBibliographicItem.from_hash(hash)
|
106
|
-
end
|
55
|
+
break unless specs.next?
|
107
56
|
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
#
|
113
|
-
# @return [RelatonW3c::W3cBibliographicItem] bibliographic item
|
114
|
-
#
|
115
|
-
def read_bibxml(file)
|
116
|
-
BibXMLParser.parse File.read(file, encoding: "UTF-8")
|
57
|
+
specs = specs.next
|
58
|
+
end
|
59
|
+
@index.sort!.save
|
60
|
+
@index1.save
|
117
61
|
end
|
118
62
|
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
# @param [RelatonW3c::W3cBibliographicItem] rel1 relation 1
|
123
|
-
# @param [RelatonW3c::W3cBibliographicItem] rel2 relation 2
|
124
|
-
#
|
125
|
-
# @return [Boolean] true if relations are same
|
126
|
-
#
|
127
|
-
def same_edition?(rel1, rel2)
|
128
|
-
return false unless rel1.type == "hasEdition" && rel1.type == rel2.type
|
129
|
-
|
130
|
-
ids1 = rel1.bibitem.docidentifier.map(&:id)
|
131
|
-
ids2 = rel2.bibitem.docidentifier.map(&:id)
|
132
|
-
(ids1 & ids2).any?
|
133
|
-
end
|
63
|
+
def fetch_spec(unrealized_spec)
|
64
|
+
spec = realize unrealized_spec
|
65
|
+
save_doc DataParser.parse(spec)
|
134
66
|
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
# @param [String] source source name "w3c-tr-archive" or "w3c-rdf"
|
139
|
-
#
|
140
|
-
# @yield [RDF::Repository] RDF repository
|
141
|
-
#
|
142
|
-
def each_dataset(source, &_block) # rubocop:disable Metrics/MethodLength
|
143
|
-
case source
|
144
|
-
when "w3c-tr-archive"
|
145
|
-
Dir["w3c-tr-archive/*.rdf"].map do |f|
|
146
|
-
@files = []
|
147
|
-
yield RDF::Repository.load(f)
|
148
|
-
end
|
149
|
-
when "w3c-rdf"
|
150
|
-
@files = []
|
151
|
-
rdf = RDF::Repository.load("http://www.w3.org/2002/01/tr-automation/tr.rdf")
|
152
|
-
yield rdf
|
153
|
-
# parse_static_dataset
|
67
|
+
if spec.links.respond_to?(:version_history) && spec.links.version_history
|
68
|
+
version_history = realize spec.links.version_history
|
69
|
+
version_history.links.spec_versions.each { |version| save_doc DataParser.parse(realize version) }
|
154
70
|
end
|
155
|
-
end
|
156
|
-
|
157
|
-
#
|
158
|
-
# Parse static dataset
|
159
|
-
#
|
160
|
-
# def parse_static_dataset
|
161
|
-
# Dir[File.expand_path("../../data/*", __dir__)].each do |file|
|
162
|
-
# xml = File.read file, encoding: "UTF-8"
|
163
|
-
# save_doc BibXMLParser.parse(xml), warn_duplicate: false
|
164
|
-
# rescue StandardError => e
|
165
|
-
# warn "Error: document #{file} #{e.message}"
|
166
|
-
# warn e.backtrace.join("\n")
|
167
|
-
# end
|
168
|
-
# end
|
169
71
|
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
#
|
175
|
-
def query_versioned_docs(rdf)
|
176
|
-
sse = SPARQL.parse(%(
|
177
|
-
PREFIX : <http://www.w3.org/2001/02pd/rec54#>
|
178
|
-
PREFIX dc: <http://purl.org/dc/elements/1.1/>
|
179
|
-
PREFIX doc: <http://www.w3.org/2000/10/swap/pim/doc#>
|
180
|
-
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
|
181
|
-
SELECT ?link ?title ?date
|
182
|
-
WHERE { ?link dc:title ?title ; dc:date ?date . }
|
183
|
-
))
|
184
|
-
rdf.query sse
|
185
|
-
end
|
72
|
+
if spec.links.respond_to?(:predecessor_versions) && spec.links.predecessor_versions
|
73
|
+
predecessor_versions = realize spec.links.predecessor_versions
|
74
|
+
predecessor_versions.links.predecessor_versions.each { |version| save_doc DataParser.parse(realize version) }
|
75
|
+
end
|
186
76
|
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
#
|
192
|
-
def query_unversioned_docs(rdf)
|
193
|
-
sse = SPARQL.parse(%(
|
194
|
-
PREFIX doc: <http://www.w3.org/2000/10/swap/pim/doc#>
|
195
|
-
SELECT ?version_of
|
196
|
-
WHERE {
|
197
|
-
?link doc:versionOf ?version_of .
|
198
|
-
FILTER ( isURI(?link) && isURI(?version_of) && ?link != ?version_of )
|
199
|
-
}
|
200
|
-
))
|
201
|
-
rdf.query(sse).uniq { |s| s.version_of.to_s.sub(/^https?:\/\//, "").sub(/\/$/, "") }
|
77
|
+
if spec.links.respond_to?(:successor_versions) && spec.links.successor_versions
|
78
|
+
successor_versions = realize spec.links.successor_versions
|
79
|
+
successor_versions.links.successor_versions.each { |version| save_doc DataParser.parse(realize version) }
|
80
|
+
end
|
202
81
|
end
|
203
82
|
|
204
83
|
#
|
@@ -206,15 +85,9 @@ module RelatonW3c
|
|
206
85
|
#
|
207
86
|
# @param [RelatonW3c::W3cBibliographicItem, nil] bib bibliographic item
|
208
87
|
#
|
209
|
-
def save_doc(bib, warn_duplicate: true)
|
88
|
+
def save_doc(bib, warn_duplicate: true)
|
210
89
|
return unless bib
|
211
90
|
|
212
|
-
c = case @format
|
213
|
-
when "xml" then bib.to_xml(bibdata: true)
|
214
|
-
when "yaml" then bib.to_hash.to_yaml
|
215
|
-
else bib.send("to_#{@format}")
|
216
|
-
end
|
217
|
-
# id = bib.docidentifier.detect(&:primary)&.id || bib.formattedref.content
|
218
91
|
file = file_name(bib.docnumber)
|
219
92
|
if @files.include?(file)
|
220
93
|
Util.warn "File #{file} already exists. Document: #{bib.docnumber}" if warn_duplicate
|
@@ -223,7 +96,15 @@ module RelatonW3c
|
|
223
96
|
@index.add pubid, file
|
224
97
|
@index1.add_or_update pubid.to_hash, file
|
225
98
|
@files << file
|
226
|
-
|
99
|
+
end
|
100
|
+
File.write file, serialize(bib), encoding: "UTF-8"
|
101
|
+
end
|
102
|
+
|
103
|
+
def serialize(bib)
|
104
|
+
case @format
|
105
|
+
when "xml" then bib.to_xml(bibdata: true)
|
106
|
+
when "yaml" then bib.to_hash.to_yaml
|
107
|
+
else bib.send("to_#{@format}")
|
227
108
|
end
|
228
109
|
end
|
229
110
|
|