relaton-w3c 1.11.5 → 1.12.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -20,22 +20,21 @@ module RelatonW3c
20
20
  @ext = format.sub(/^bib/, "")
21
21
  dir = File.dirname(File.expand_path(__FILE__))
22
22
  @group_names = YAML.load_file(File.join(dir, "workgroups.yaml"))
23
- @data = RDF::Repository.load("http://www.w3.org/2002/01/tr-automation/tr.rdf")
24
- @files = []
25
23
  @index = DataIndex.new
26
24
  end
27
25
 
28
26
  #
29
27
  # Initialize fetcher and run fetch
30
28
  #
29
+ # @param [String] source source name "w3c-tr-archive" or "w3c-rdf"
31
30
  # @param [String] output directory to save files, default: "data"
32
31
  # @param [String] format format of output files (xml, yaml, bibxml), default: yaml
33
32
  #
34
- def self.fetch(output: "data", format: "yaml")
33
+ def self.fetch(source, output: "data", format: "yaml")
35
34
  t1 = Time.now
36
35
  puts "Started at: #{t1}"
37
- FileUtils.mkdir_p output unless Dir.exist? output
38
- new(output, format).fetch
36
+ FileUtils.mkdir_p output
37
+ new(output, format).fetch source
39
38
  t2 = Time.now
40
39
  puts "Stopped at: #{t2}"
41
40
  puts "Done in: #{(t2 - t1).round} sec."
@@ -44,19 +43,89 @@ module RelatonW3c
44
43
  #
45
44
  # Parse documents
46
45
  #
47
- def fetch # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
48
- query_versioned_docs.each do |sl|
49
- save_doc DataParser.parse(sl, self)
50
- rescue StandardError => e
51
- warn "Error: document #{sl.link} #{e.message}"
52
- warn e.backtrace.join("\n")
46
+ # @param [String] source source name "w3c-tr-archive" or "w3c-rdf"
47
+ #
48
+ def fetch(source) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
49
+ each_dataset(source) do |rdf|
50
+ %i[versioned unversioned].each do |type|
51
+ send("query_#{type}_docs", rdf).each do |sl|
52
+ bib = DataParser.parse(rdf, sl, self)
53
+ add_has_edition_relation(bib) if type == :unversioned
54
+ save_doc bib
55
+ rescue StandardError => e
56
+ link = sl.respond_to?(:link) ? sl.link : sl.version_of
57
+ warn "Error: document #{link} #{e.message}"
58
+ warn e.backtrace.join("\n")
59
+ end
60
+ end
53
61
  end
54
- query_unversioned_docs.each do |sl|
55
- save_doc DataParser.parse(sl, self)
56
- rescue StandardError => e
57
- warn "Error: document #{sl.version_of} #{e.message}"
58
- warn e.backtrace.join("\n")
62
+ @index.sort!.save
63
+ end
64
+
65
+ #
66
+ # Add hasEdition relations from a previously parsed document
67
+ #
68
+ # @param [RelatonW3c::W3cBibliographicItem] bib bibliographic item
69
+ #
70
+ def add_has_edition_relation(bib) # rubocop:disable Metrics/MethodLength, Metrics/AbcSize, Metrics/CyclomaticComplexity
71
+ file = file_name bib.docnumber
72
+ return unless File.exist? file
73
+
74
+ b = case @format
75
+ when "xml" then XMLParser.from_xml(File.read(file, encoding: "UTF-8"))
76
+ when "yaml"
77
+ hash = YAML.load_file(file)
78
+ W3cBibliographicItem.from_hash(hash)
79
+ when "bibxml" then BibXMLParser.parse File.read(file, encoding: "UTF-8")
80
+ end
81
+ b.relation.each do |r|
82
+ same_edition = bib.relation.detect { |r2| same_edition?(r, r2) }
83
+ bib.relation << r unless same_edition
59
84
  end
85
+ end
86
+
87
+ #
88
+ # Compare two relations
89
+ #
90
+ # @param [RelatonBib::DocumentRelation] rel1 relation 1
91
+ # @param [RelatonBib::DocumentRelation] rel2 relation 2
92
+ #
93
+ # @return [Boolean] true if relations are same
94
+ #
95
+ def same_edition?(rel1, rel2)
96
+ return false unless rel1.type == "hasEdition" && rel1.type == rel2.type
97
+
98
+ ids1 = rel1.bibitem.docidentifier.map(&:id)
99
+ ids2 = rel2.bibitem.docidentifier.map(&:id)
100
+ (ids1 & ids2).any?
101
+ end
102
+
103
+ #
104
+ # Yield fetching for each dataset
105
+ #
106
+ # @param [String] source source name "w3c-tr-archive" or "w3c-rdf"
107
+ #
108
+ # @yield [RDF::Repository] RDF repository
109
+ #
110
+ def each_dataset(source, &_block) # rubocop:disable Metrics/MethodLength
111
+ case source
112
+ when "w3c-tr-archive"
113
+ Dir["w3c-tr-archive/*.rdf"].map do |f|
114
+ @files = []
115
+ yield RDF::Repository.load(f)
116
+ end
117
+ when "w3c-rdf"
118
+ @files = []
119
+ rdf = RDF::Repository.load("http://www.w3.org/2002/01/tr-automation/tr.rdf")
120
+ yield rdf
121
+ parse_static_dataset
122
+ end
123
+ end
124
+
125
+ #
126
+ # Parse static dataset
127
+ #
128
+ def parse_static_dataset
60
129
  Dir[File.expand_path("../../data/*", __dir__)].each do |file|
61
130
  xml = File.read file, encoding: "UTF-8"
62
131
  save_doc BibXMLParser.parse(xml), warn_duplicate: false
@@ -64,36 +133,40 @@ module RelatonW3c
64
133
  warn "Error: document #{file} #{e.message}"
65
134
  warn e.backtrace.join("\n")
66
135
  end
67
- @index.sort!.save
68
136
  end
69
137
 
70
138
  #
71
- # Query RDF source for documents
139
+ # Query RDF source for versioned documents
72
140
  #
73
141
  # @return [RDF::Query::Solutions] query results
74
142
  #
75
- def query_versioned_docs # rubocop:disable Metrics/MethodLength
143
+ def query_versioned_docs(rdf)
76
144
  sse = SPARQL.parse(%(
77
145
  PREFIX : <http://www.w3.org/2001/02pd/rec54#>
78
146
  PREFIX dc: <http://purl.org/dc/elements/1.1/>
79
147
  PREFIX doc: <http://www.w3.org/2000/10/swap/pim/doc#>
80
- # PREFIX mat: <http://www.w3.org/2002/05/matrix/vocab#>
81
148
  PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
82
- SELECT ?link ?title ?date ?version_of
83
- WHERE {
84
- ?link dc:title ?title ; dc:date ?date ; doc:versionOf ?version_of .
85
- }
149
+ SELECT ?link ?title ?date
150
+ WHERE { ?link dc:title ?title ; dc:date ?date . }
86
151
  ))
87
- data.query sse
152
+ rdf.query sse
88
153
  end
89
154
 
90
- def query_unversioned_docs
155
+ #
156
+ # Query RDF source for unversioned documents
157
+ #
158
+ # @return [Array<RDF::Query::Solution>] query results
159
+ #
160
+ def query_unversioned_docs(rdf)
91
161
  sse = SPARQL.parse(%(
92
162
  PREFIX doc: <http://www.w3.org/2000/10/swap/pim/doc#>
93
163
  SELECT ?version_of
94
- WHERE { ?x doc:versionOf ?version_of . }
164
+ WHERE {
165
+ ?link doc:versionOf ?version_of .
166
+ FILTER ( isURI(?link) && isURI(?version_of) && ?link != ?version_of )
167
+ }
95
168
  ))
96
- data.query(sse).uniq &:version_of
169
+ rdf.query(sse).uniq { |s| s.version_of.to_s.sub(/^https?:\/\//, "").sub(/\/$/, "") }
97
170
  end
98
171
 
99
172
  #
@@ -14,15 +14,24 @@ module RelatonW3c
14
14
  end
15
15
 
16
16
  #
17
- # Add document to index
17
+ # Add document to index or update it if already exists
18
18
  #
19
19
  # @param [String] docnumber document number
20
20
  # @param [String] file path to document file
21
21
  #
22
- def add(docnumber, file)
22
+ def add(docnumber, file) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity, Metrics/MethodLength
23
23
  dnparts = self.class.docnumber_to_parts docnumber
24
- dnparts[:file] = file
25
- @index << dnparts
24
+ rec = @index.detect { |i| i[:file] == file }
25
+ if rec
26
+ rec[:code] = dnparts[:code]
27
+ dnparts[:stage] ? rec[:stage] = dnparts[:stage] : rec.delete(:stage)
28
+ dnparts[:type] ? rec[:type] = dnparts[:type] : rec.delete(:type)
29
+ dnparts[:date] ? rec[:date] = dnparts[:date] : rec.delete(:date)
30
+ dnparts[:suff] ? rec[:suff] = dnparts[:suff] : rec.delete(:suff)
31
+ else
32
+ dnparts[:file] = file
33
+ @index << dnparts
34
+ end
26
35
  end
27
36
 
28
37
  #
@@ -111,18 +120,24 @@ module RelatonW3c
111
120
  #
112
121
  # @return [RelatonW3c::DataIndex] data index
113
122
  #
114
- def create_from_repo # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
115
- resp = Zip::InputStream.new URI("#{W3cBibliography::SOURCE}index-w3c.zip").open
123
+ def create_from_repo
124
+ uri = URI("#{W3cBibliography::SOURCE}index-w3c.zip").open
125
+ resp = Zip::InputStream.new uri
116
126
  zip = resp.get_next_entry
127
+ index = RelatonBib.parse_yaml(zip.get_input_stream.read, [Symbol])
128
+ new index: index
129
+ end
117
130
 
118
- # Newer versions of Psych uses the `permitted_classes:` parameter
119
- index = if YAML.method(:safe_load).parameters.collect(&:last).index(:permitted_classes)
120
- YAML.safe_load(zip.get_input_stream.read, permitted_classes: [Symbol])
121
- else
122
- YAML.safe_load(zip.get_input_stream.read, [Symbol])
123
- end
124
-
125
- DataIndex.new index: index
131
+ #
132
+ # Create index from a file
133
+ #
134
+ # @param [String] index_file path to index file
135
+ #
136
+ # @return [RelatonW3c::DataIndex] data index
137
+ #
138
+ def create_from_file(index_file = "index-w3c.yaml")
139
+ index = RelatonBib.parse_yaml(File.read(index_file), [Symbol])
140
+ new index_file: index_file, index: index
126
141
  end
127
142
 
128
143
  #
@@ -25,7 +25,8 @@ module RelatonW3c
25
25
  # @param [RDF::Query::Solution] sol entry from the SPARQL query
26
26
  # @param [RelatonW3c::DataFetcher] fetcher data fetcher
27
27
  #
28
- def initialize(sol, fetcher)
28
+ def initialize(rdf, sol, fetcher)
29
+ @rdf = rdf
29
30
  @sol = sol
30
31
  @fetcher = fetcher
31
32
  end
@@ -38,8 +39,8 @@ module RelatonW3c
38
39
  #
39
40
  # @return [RelatonW3c::W3cBibliographicItem, nil] bibliographic item
40
41
  #
41
- def self.parse(sol, fetcher)
42
- new(sol, fetcher).parse
42
+ def self.parse(rdf, sol, fetcher)
43
+ new(rdf, sol, fetcher).parse
43
44
  end
44
45
 
45
46
  #
@@ -86,9 +87,10 @@ module RelatonW3c
86
87
  # @return [RelatonBib::TypedTitleStringCollection] title
87
88
  #
88
89
  def parse_title
89
- return [] unless @sol.respond_to?(:title)
90
-
91
- t = RelatonBib::TypedTitleString.new content: @sol.title.to_s
90
+ content = if @sol.respond_to?(:title) then @sol.title.to_s
91
+ else document_versions.max_by { |dv| dv.date.to_s }.title.to_s
92
+ end
93
+ t = RelatonBib::TypedTitleString.new content: content
92
94
  RelatonBib::TypedTitleStringCollection.new [t]
93
95
  end
94
96
 
@@ -99,8 +101,7 @@ module RelatonW3c
99
101
  #
100
102
  def parse_link
101
103
  link = @sol.respond_to?(:link) ? @sol.link : @sol.version_of
102
-
103
- [RelatonBib::TypedUri.new(type: "src", content: link.to_s)]
104
+ [RelatonBib::TypedUri.new(type: "src", content: link.to_s.strip)]
104
105
  end
105
106
 
106
107
  #
@@ -109,9 +110,7 @@ module RelatonW3c
109
110
  # @return [Array<RelatonBib::DocumentIdentifier>] docidentifier
110
111
  #
111
112
  def parse_docid
112
- return [] unless @sol.respond_to?(:link)
113
-
114
- id = pub_id(@sol.link)
113
+ id = @sol.respond_to?(:link) ? pub_id(@sol.link) : pub_id(@sol.version_of)
115
114
  [RelatonBib::DocumentIdentifier.new(type: "W3C", id: id, primary: true)]
116
115
  end
117
116
 
@@ -133,7 +132,7 @@ module RelatonW3c
133
132
  #
134
133
  def identifier(link = nil)
135
134
  url = link || (@sol.respond_to?(:link) ? @sol.link : @sol.version_of)
136
- self.class.parse_identifier(url.to_s)
135
+ self.class.parse_identifier(url.to_s.strip)
137
136
  end
138
137
 
139
138
  #
@@ -144,7 +143,7 @@ module RelatonW3c
144
143
  # @return [String] identifier
145
144
  #
146
145
  def self.parse_identifier(url)
147
- if /.+\/(\w+(?:-[\w.]+)+(?:\/\w+)?)/ =~ url.to_s
146
+ if /.+\/(\w+(?:[-+][\w.]+)+(?:\/\w+)?)/ =~ url.to_s
148
147
  $1.to_s
149
148
  else url.to_s.split("/").last
150
149
  end
@@ -186,10 +185,10 @@ module RelatonW3c
186
185
  PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
187
186
  SELECT ?type
188
187
  WHERE {
189
- { <#{@sol.link}> rdf:type ?type }
188
+ { <#{@sol.link.to_s.strip}> rdf:type ?type }
190
189
  }
191
190
  ))
192
- @fetcher.data.query(sse).map { |s| s.type.to_s.split("#").last }
191
+ @rdf.query(sse).map { |s| s.type.to_s.split("#").last }
193
192
  end
194
193
  end
195
194
 
@@ -221,10 +220,16 @@ module RelatonW3c
221
220
  def parse_relation
222
221
  if @sol.respond_to?(:link)
223
222
  relations + editor_drafts
224
- else document_versions
223
+ else
224
+ document_versions.map { |r| create_relation(r.link.to_s.strip, "hasEdition") }
225
225
  end
226
226
  end
227
227
 
228
+ #
229
+ # Create relations
230
+ #
231
+ # @return [Array<RelatonBib::DocumentRelation>] relations
232
+ #
228
233
  def relations # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
229
234
  {
230
235
  "doc:obsoletes" => { type: "obsoletes" },
@@ -234,53 +239,104 @@ module RelatonW3c
234
239
  ":previousEdition" => { type: "editionOf" },
235
240
  }.reduce([]) do |acc, (predicate, tp)|
236
241
  acc + relation_query(predicate).map do |r|
237
- fr = RelatonBib::LocalizedString.new pub_id(r.rel.to_s)
238
- bib = W3cBibliographicItem.new formattedref: fr
239
- tp[:description] = RelatonBib::FormattedString.new content: tp[:description] if tp[:description]
240
- RelatonBib::DocumentRelation.new(**tp, bibitem: bib)
242
+ create_relation(r.rel.to_s, tp[:type], tp[:description])
241
243
  end
242
244
  end
243
245
  end
244
246
 
247
+ #
248
+ # Parse editor drafts relation
249
+ #
250
+ # @return [Array<RelatonBib::DocumentRelation>] relation
251
+ #
245
252
  def editor_drafts # rubocop:disable Metrics/MethodLength
246
253
  sse = SPARQL.parse(%(
247
254
  PREFIX : <http://www.w3.org/2001/02pd/rec54#>
248
255
  PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
249
256
  SELECT ?rel
250
- WHERE { <#{@sol.link}> :ED ?rel . }
257
+ WHERE { <#{@sol.link.to_s.strip}> :ED ?rel . }
251
258
  ))
252
- @fetcher.data.query(sse).map do |s|
253
- fr = RelatonBib::LocalizedString.new pub_id(s.rel.to_s)
254
- bib = W3cBibliographicItem.new formattedref: fr
255
- desc = RelatonBib::FormattedString.new content: "Editor's draft"
256
- RelatonBib::DocumentRelation.new(
257
- type: "hasDraft", description: desc, bibitem: bib,
258
- )
259
+ @rdf.query(sse).map do |s|
260
+ create_relation(s.rel.to_s, "hasDraft", "Editor's draft")
259
261
  end
260
262
  end
261
263
 
264
+ #
265
+ # Query for relations
266
+ #
267
+ # @param [String] predicate relation type
268
+ #
269
+ # @return [RDF::Query::Solutions] query result
270
+ #
262
271
  def relation_query(predicate)
263
272
  sse = SPARQL.parse(%(
264
273
  PREFIX : <http://www.w3.org/2001/02pd/rec54#>
265
274
  PREFIX doc: <http://www.w3.org/2000/10/swap/pim/doc#>
266
275
  PREFIX mat: <http://www.w3.org/2002/05/matrix/vocab#>
267
276
  SELECT ?rel
268
- WHERE { <#{@sol.link}> #{predicate} ?rel . }
277
+ WHERE { <#{@sol.link.to_s.strip}> #{predicate} ?rel . }
269
278
  ))
270
- @fetcher.data.query(sse).order_by(:rel)
279
+ @rdf.query(sse).order_by(:rel)
280
+ end
281
+
282
+ #
283
+ # Query document versions relations
284
+ #
285
+ # @return [Array<RDF::Query::Solution>] query results
286
+ #
287
+ def document_versions # rubocop:disable Metrics/MethodLength
288
+ @document_versions ||= version_of.each_with_object([]) do |s, acc|
289
+ sse = SPARQL.parse(%(
290
+ PREFIX : <http://www.w3.org/2001/02pd/rec54#>
291
+ PREFIX dc: <http://purl.org/dc/elements/1.1/>
292
+ PREFIX doc: <http://www.w3.org/2000/10/swap/pim/doc#>
293
+ PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
294
+ SELECT ?link ?title ?date
295
+ WHERE {
296
+ ?link doc:versionOf <#{s.version_of}> ;
297
+ dc:title ?title ;
298
+ dc:date ?date .
299
+ }
300
+ ))
301
+ @rdf.query(sse).each { |r| acc << r }
302
+ end
271
303
  end
272
304
 
273
- def document_versions
305
+ #
306
+ # Query for document versions
307
+ #
308
+ # @return [RDF::Query::Solutions] query results
309
+ #
310
+ def version_of
311
+ return [@sol] unless @sol.respond_to?(:link)
312
+
274
313
  sse = SPARQL.parse(%(
275
314
  PREFIX doc: <http://www.w3.org/2000/10/swap/pim/doc#>
276
- SELECT ?link
277
- WHERE { ?link doc:versionOf <#{@sol.version_of}> }
315
+ SELECT ?version_of
316
+ WHERE {
317
+ <#{@sol.link.to_s.strip}> doc:versionOf ?version_of .
318
+ FILTER ( isURI(?version_of) && <#{@sol.link.to_s.strip}> != str(?version_of) )
319
+ }
278
320
  ))
279
- @fetcher.data.query(sse).map do |r|
280
- fref = RelatonBib::FormattedRef.new content: pub_id(r.link)
281
- bib = W3cBibliographicItem.new formattedref: fref
282
- RelatonBib::DocumentRelation.new(type: "hasEdition", bibitem: bib)
283
- end
321
+ @rdf.query(sse)
322
+ end
323
+
324
+ #
325
+ # Create relation
326
+ #
327
+ # @param [String] url relation URL
328
+ # @param [String] type relation type
329
+ # @param [String, nil] desc relation description
330
+ #
331
+ # @return [RelatonBib::DocumentRelation] document relation
332
+ #
333
+ def create_relation(url, type, desc = nil)
334
+ id = pub_id(url)
335
+ fref = RelatonBib::FormattedRef.new content: id
336
+ docid = RelatonBib::DocumentIdentifier.new(type: "W3C", id: id, primary: true)
337
+ bib = W3cBibliographicItem.new formattedref: fref, docid: [docid]
338
+ dsc = RelatonBib::FormattedString.new content: desc if desc
339
+ RelatonBib::DocumentRelation.new(type: type, bibitem: bib, description: dsc)
284
340
  end
285
341
 
286
342
  #
@@ -307,10 +363,10 @@ module RelatonW3c
307
363
  PREFIX contact: <http://www.w3.org/2000/10/swap/pim/contact#>
308
364
  SELECT ?full_name
309
365
  WHERE {
310
- <#{@sol.link}> :editor/contact:fullName ?full_name
366
+ <#{@sol.link.to_s.strip}> :editor/contact:fullName ?full_name
311
367
  }
312
368
  ))
313
- @fetcher.data.query(sse).order_by(:full_name).map do |ed|
369
+ @rdf.query(sse).order_by(:full_name).map do |ed|
314
370
  cn = RelatonBib::LocalizedString.new(ed.full_name.to_s, "en", "Latn")
315
371
  n = RelatonBib::FullName.new completename: cn
316
372
  p = RelatonBib::Person.new name: n
@@ -331,12 +387,13 @@ module RelatonW3c
331
387
  PREFIX contact: <http://www.w3.org/2000/10/swap/pim/contact#>
332
388
  SELECT ?home_page
333
389
  WHERE {
334
- <#{@sol.link}> org:deliveredBy/contact:homePage ?home_page
390
+ <#{@sol.link.to_s.strip}> org:deliveredBy/contact:homePage ?home_page
335
391
  }
336
392
  ))
337
- res = @fetcher.data.query(sse).order_by(:home_page)
393
+ res = @rdf.query(sse).order_by(:home_page)
338
394
  tc = res.each_with_object([]) do |edg, obj|
339
- wg = @fetcher.group_names[edg.home_page.to_s.sub(/\/$/, "")]
395
+ group_path = edg.home_page.to_s.sub(/^https?:\/\//, "").sub(/\/$/, "")
396
+ wg = @fetcher.group_names[group_path]
340
397
  if wg
341
398
  rwg = RelatonBib::WorkGroup.new name: wg["name"]
342
399
  obj << RelatonBib::TechnicalCommittee.new(rwg)
@@ -9,7 +9,7 @@ module RelatonW3c
9
9
  @prefix = "W3C"
10
10
  @defaultprefix = %r{^W3C\s}
11
11
  @idtype = "W3C"
12
- @datasets = %w[w3c-rdf]
12
+ @datasets = %w[w3c-rdf w3c-tr-archive]
13
13
  end
14
14
 
15
15
  # @param code [String]
@@ -28,8 +28,8 @@ module RelatonW3c
28
28
  # @option opts [String] :output directory to output documents
29
29
  # @option opts [String] :format
30
30
  #
31
- def fetch_data(_source, opts)
32
- DataFetcher.fetch(**opts)
31
+ def fetch_data(source, opts)
32
+ DataFetcher.fetch(source, **opts)
33
33
  end
34
34
 
35
35
  # @param xml [String]
@@ -1,3 +1,3 @@
1
1
  module RelatonW3c
2
- VERSION = "1.11.5".freeze
2
+ VERSION = "1.12.1".freeze
3
3
  end