relaton-w3c 1.19.0 → 1.20.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dde70fdc615a88c4343add641e193635a5377af68180d96122b02af747aa50e5
4
- data.tar.gz: 363d74d0a170b307006310bbbe2f6e645da40aabf12dbc2a94ebac4887f2452f
3
+ metadata.gz: beb3bb1a8550059e754f001412336b78ab55771f25ebf33534dd92f59a724b9d
4
+ data.tar.gz: d32d55f604c16613c04d5d2ad5554e7c7ed8c6151ad0218411c47d58099b5a63
5
5
  SHA512:
6
- metadata.gz: fec1491eaa3108fd8726dfa0af50f54b98d7c28286c936d827088eed8ee7badbe6be430dd4588ad6324936f6cce8824379750a71ac4da9aa8d88788a47fceec4
7
- data.tar.gz: e9d4074458f1a01e12ab84619861dda851ce668573d3b3e843a2c7ba8effed42e6c94545b49157626ba922ebfc31b57ff9d5877461626d2578a9ebdf128d314c
6
+ metadata.gz: b2927bebbb35dde93cb2659b44e0ef47608ffa164ae33d138041a46139ca17b1a62061eb1055959fe91771bf397bf5ecb11f9ff2ca19055d7d804a65962a8974
7
+ data.tar.gz: 240f1c9f84770eb129a372a015b582c7f3ff6870eaf83e2dd723ff9538bbc19da34d58190234441b3c6b9dde224810e09c5e287473afa1183f23c53deabcc849
data/README.adoc CHANGED
@@ -108,16 +108,13 @@ RelatonW3c::W3cBibliographicItem.new **bib_hash
108
108
 
109
109
  === Fetch data
110
110
 
111
- The method `RelatonW3c::DataFetcher.fetch(source, output: "data", format: "yaml")` converts all the documents from the dataset and saves them to the `./data` folder in YAML format.
111
+ The method `RelatonW3c::DataFetcher.fetch(output: "data", format: "yaml")` converts all the documents from the dataset and saves them to the `./data` folder in YAML format.
112
112
  Arguments:
113
113
 
114
- - `source` - the name of the dataset (`w3c-rdf` or `w3c-tr-archive`)
115
114
  - `output` - folder to save documents (default './data').
116
115
  - `format` - the format in which the documents are saved. Possible formats are: `yaml`, `xml`, `bibxml` (default `yaml`).
117
116
 
118
- The available datasets are:
119
- - `w3c-rdf` - The dataset is fetched from http://www.w3.org/2002/01/tr-automation/tr.rdf.
120
- - `w3c-tr-archive` - The archive dataset files should be downloaded from https://github.com/relaton/w3c-tr-archive repository and placed into `w3c-tr-archive` folder.
117
+ The method uses the W3C API (https://api.w3.org/doc) to fetch all the W3C documents.
121
118
 
122
119
  [source,ruby]
123
120
  ----
@@ -1,12 +1,10 @@
1
- require "rdf"
2
- require "linkeddata"
3
- require "sparql"
4
- require "mechanize"
5
- require "relaton_w3c/data_parser"
1
+ require "w3c_api"
2
+ require_relative "rate_limit_handler"
3
+ require_relative "data_parser"
6
4
 
7
5
  module RelatonW3c
8
6
  class DataFetcher
9
- attr_reader :data, :group_names
7
+ include RelatonW3c::RateLimitHandler
10
8
 
11
9
  #
12
10
  # Data fetcher initializer
@@ -18,8 +16,8 @@ module RelatonW3c
18
16
  @output = output
19
17
  @format = format
20
18
  @ext = format.sub(/^bib/, "")
21
- dir = File.dirname(File.expand_path(__FILE__))
22
- @group_names = YAML.load_file(File.join(dir, "workgroups.yaml"))
19
+ @files = Set.new
20
+ @fetched_urls = {}
23
21
  @index = DataIndex.create_from_file
24
22
  @index1 = Relaton::Index.find_or_create :W3C, file: "index1.yaml"
25
23
  end
@@ -27,178 +25,59 @@ module RelatonW3c
27
25
  #
28
26
  # Initialize fetcher and run fetch
29
27
  #
30
- # @param [String] source source name "w3c-tr-archive" or "w3c-rdf"
31
28
  # @param [String] output directory to save files, default: "data"
32
29
  # @param [String] format format of output files (xml, yaml, bibxml), default: yaml
33
30
  #
34
- def self.fetch(source, output: "data", format: "yaml")
31
+ def self.fetch(output: "data", format: "yaml")
35
32
  t1 = Time.now
36
33
  puts "Started at: #{t1}"
37
34
  FileUtils.mkdir_p output
38
- new(output, format).fetch source
35
+ new(output, format).fetch
39
36
  t2 = Time.now
40
37
  puts "Stopped at: #{t2}"
41
38
  puts "Done in: #{(t2 - t1).round} sec."
42
39
  end
43
40
 
44
- #
45
- # Parse documents
46
- #
47
- # @param [String] source source name "w3c-tr-archive" or "w3c-rdf"
48
- #
49
- def fetch(source) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
50
- each_dataset(source) do |rdf|
51
- %i[versioned unversioned].each do |type|
52
- send("query_#{type}_docs", rdf).each do |sl|
53
- bib = DataParser.parse(rdf, sl, self)
54
- add_has_edition_relation(bib) if type == :unversioned
55
- save_doc bib
56
- rescue StandardError => e
57
- link = sl.respond_to?(:link) ? sl.link : sl.version_of
58
- Util.error "Error: document #{link} #{e.message}\n#{e.backtrace.join("\n")}"
59
- end
60
- end
61
- end
62
- @index.sort!.save
63
- @index1.save
41
+ def client
42
+ @client ||= W3cApi::Client.new
64
43
  end
65
44
 
66
45
  #
67
- # Add hasEdition relations form previous parsed document
68
- #
69
- # @param [RelatonW3c::W3cBibliographicItem] bib bibligraphic item
46
+ # Parse documents
70
47
  #
71
- def add_has_edition_relation(bib) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity, Metrics/MethodLength
72
- file = file_name bib.docnumber
73
- if File.exist? file
74
- item = send "read_#{@format}", file
75
- item.relation.each do |r1|
76
- r1.type = "hasEdition" if r1.type == "instanceOf"
77
- same_edition = bib.relation.detect { |r2| same_edition?(r1, r2) }
78
- bib.relation << r1 unless same_edition
48
+ def fetch
49
+ specs = client.specifications
50
+ loop do
51
+ specs.links.specifications.each do |spec|
52
+ fetch_spec spec
79
53
  end
80
- end
81
- bib.relation.select { |r| r.type == "hasEdition" }
82
- .max_by { |r| r.bibitem.id.match(/(?<=-)\d{8}$/).to_s }&.type = "instanceOf"
83
- end
84
54
 
85
- #
86
- # Read XML file
87
- #
88
- # @param [String] file file name
89
- #
90
- # @return [RelatonW3c::W3cBibliographicItem] bibliographic item
91
- #
92
- def read_xml(file)
93
- XMLParser.from_xml(File.read(file, encoding: "UTF-8"))
94
- end
95
-
96
- #
97
- # Read YAML file
98
- #
99
- # @param [String] file file name
100
- #
101
- # @return [RelatonW3c::W3cBibliographicItem] bibliographic item
102
- #
103
- def read_yaml(file)
104
- hash = YAML.load_file(file)
105
- W3cBibliographicItem.from_hash(hash)
106
- end
55
+ break unless specs.next?
107
56
 
108
- #
109
- # Read BibXML file
110
- #
111
- # @param [String] file file name
112
- #
113
- # @return [RelatonW3c::W3cBibliographicItem] bibliographic item
114
- #
115
- def read_bibxml(file)
116
- BibXMLParser.parse File.read(file, encoding: "UTF-8")
57
+ specs = specs.next
58
+ end
59
+ @index.sort!.save
60
+ @index1.save
117
61
  end
118
62
 
119
- #
120
- # Compare two relations
121
- #
122
- # @param [RelatonW3c::W3cBibliographicItem] rel1 relation 1
123
- # @param [RelatonW3c::W3cBibliographicItem] rel2 relation 2
124
- #
125
- # @return [Boolean] true if relations are same
126
- #
127
- def same_edition?(rel1, rel2)
128
- return false unless rel1.type == "hasEdition" && rel1.type == rel2.type
129
-
130
- ids1 = rel1.bibitem.docidentifier.map(&:id)
131
- ids2 = rel2.bibitem.docidentifier.map(&:id)
132
- (ids1 & ids2).any?
133
- end
63
+ def fetch_spec(unrealized_spec)
64
+ spec = realize unrealized_spec
65
+ save_doc DataParser.parse(spec)
134
66
 
135
- #
136
- # Yield fetching for each dataset
137
- #
138
- # @param [String] source source name "w3c-tr-archive" or "w3c-rdf"
139
- #
140
- # @yield [RDF::Repository] RDF repository
141
- #
142
- def each_dataset(source, &_block) # rubocop:disable Metrics/MethodLength
143
- case source
144
- when "w3c-tr-archive"
145
- Dir["w3c-tr-archive/*.rdf"].map do |f|
146
- @files = []
147
- yield RDF::Repository.load(f)
148
- end
149
- when "w3c-rdf"
150
- @files = []
151
- rdf = RDF::Repository.load("http://www.w3.org/2002/01/tr-automation/tr.rdf")
152
- yield rdf
153
- # parse_static_dataset
67
+ if spec.links.respond_to?(:version_history) && spec.links.version_history
68
+ version_history = realize spec.links.version_history
69
+ version_history.links.spec_versions.each { |version| save_doc DataParser.parse(realize version) }
154
70
  end
155
- end
156
-
157
- #
158
- # Parse static dataset
159
- #
160
- # def parse_static_dataset
161
- # Dir[File.expand_path("../../data/*", __dir__)].each do |file|
162
- # xml = File.read file, encoding: "UTF-8"
163
- # save_doc BibXMLParser.parse(xml), warn_duplicate: false
164
- # rescue StandardError => e
165
- # warn "Error: document #{file} #{e.message}"
166
- # warn e.backtrace.join("\n")
167
- # end
168
- # end
169
71
 
170
- #
171
- # Query RDF source for versioned documents
172
- #
173
- # @return [RDF::Query::Solutions] query results
174
- #
175
- def query_versioned_docs(rdf)
176
- sse = SPARQL.parse(%(
177
- PREFIX : <http://www.w3.org/2001/02pd/rec54#>
178
- PREFIX dc: <http://purl.org/dc/elements/1.1/>
179
- PREFIX doc: <http://www.w3.org/2000/10/swap/pim/doc#>
180
- PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
181
- SELECT ?link ?title ?date
182
- WHERE { ?link dc:title ?title ; dc:date ?date . }
183
- ))
184
- rdf.query sse
185
- end
72
+ if spec.links.respond_to?(:predecessor_versions) && spec.links.predecessor_versions
73
+ predecessor_versions = realize spec.links.predecessor_versions
74
+ predecessor_versions.links.predecessor_versions.each { |version| save_doc DataParser.parse(realize version) }
75
+ end
186
76
 
187
- #
188
- # Query RDF source for unversioned documents
189
- #
190
- # @return [Array<RDF::Query::Solution>] query results
191
- #
192
- def query_unversioned_docs(rdf)
193
- sse = SPARQL.parse(%(
194
- PREFIX doc: <http://www.w3.org/2000/10/swap/pim/doc#>
195
- SELECT ?version_of
196
- WHERE {
197
- ?link doc:versionOf ?version_of .
198
- FILTER ( isURI(?link) && isURI(?version_of) && ?link != ?version_of )
199
- }
200
- ))
201
- rdf.query(sse).uniq { |s| s.version_of.to_s.sub(/^https?:\/\//, "").sub(/\/$/, "") }
77
+ if spec.links.respond_to?(:successor_versions) && spec.links.successor_versions
78
+ successor_versions = realize spec.links.successor_versions
79
+ successor_versions.links.successor_versions.each { |version| save_doc DataParser.parse(realize version) }
80
+ end
202
81
  end
203
82
 
204
83
  #
@@ -206,15 +85,9 @@ module RelatonW3c
206
85
  #
207
86
  # @param [RelatonW3c::W3cBibliographicItem, nil] bib bibliographic item
208
87
  #
209
- def save_doc(bib, warn_duplicate: true) # rubocop:disable Metrics/MethodLength, Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
88
+ def save_doc(bib, warn_duplicate: true)
210
89
  return unless bib
211
90
 
212
- c = case @format
213
- when "xml" then bib.to_xml(bibdata: true)
214
- when "yaml" then bib.to_hash.to_yaml
215
- else bib.send("to_#{@format}")
216
- end
217
- # id = bib.docidentifier.detect(&:primary)&.id || bib.formattedref.content
218
91
  file = file_name(bib.docnumber)
219
92
  if @files.include?(file)
220
93
  Util.warn "File #{file} already exists. Document: #{bib.docnumber}" if warn_duplicate
@@ -223,7 +96,15 @@ module RelatonW3c
223
96
  @index.add pubid, file
224
97
  @index1.add_or_update pubid.to_hash, file
225
98
  @files << file
226
- File.write file, c, encoding: "UTF-8"
99
+ end
100
+ File.write file, serialize(bib), encoding: "UTF-8"
101
+ end
102
+
103
+ def serialize(bib)
104
+ case @format
105
+ when "xml" then bib.to_xml(bibdata: true)
106
+ when "yaml" then bib.to_hash.to_yaml
107
+ else bib.send("to_#{@format}")
227
108
  end
228
109
  end
229
110