relaton-w3c 1.7.2 → 1.9.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/rake.yml +1 -11
- data/.rubocop.yml +1 -1
- data/Gemfile +1 -1
- data/README.adoc +29 -0
- data/grammars/basicdoc.rng +165 -20
- data/grammars/biblio.rng +5 -6
- data/lib/relaton_w3c/data_fethcer.rb +106 -0
- data/lib/relaton_w3c/data_parser.rb +205 -0
- data/lib/relaton_w3c/hash_converter.rb +1 -1
- data/lib/relaton_w3c/hit_collection.rb +7 -7
- data/lib/relaton_w3c/processor.rb +15 -2
- data/lib/relaton_w3c/scrapper.rb +17 -17
- data/lib/relaton_w3c/version.rb +1 -1
- data/lib/relaton_w3c/w3c_bibliographic_item.rb +1 -1
- data/lib/relaton_w3c/workgroups.yaml +339 -0
- data/lib/relaton_w3c/xml_parser.rb +1 -1
- data/lib/relaton_w3c.rb +1 -0
- data/relaton_w3c.gemspec +6 -4
- metadata +65 -6
@@ -0,0 +1,205 @@
|
|
1
|
+
module RelatonW3c
  #
  # Converts one solution of the SPARQL query into a bibliographic item.
  # Additional details (type, relations, editors, working groups) are looked up
  # by querying `fetcher.data` with the document link taken from the solution.
  #
  class DataParser
    #
    # Document parser initialization
    #
    # @param [RDF::Query::Solution] sol entry from the SPARQL query
    # @param [RelatonW3c::DataFetcher] fetcher data fetcher
    #
    def initialize(sol, fetcher)
      @sol = sol
      @fetcher = fetcher
    end

    #
    # Initialize document parser and run it
    #
    # @param [RDF::Query::Solution] sol entry from the SPARQL query
    # @param [RelatonW3c::DataFetcher] fetcher data fetcher
    #
    # @return [RelatonW3c::W3cBibliographicItem, nil] bibliographic item
    #
    def self.parse(sol, fetcher)
      new(sol, fetcher).parse
    end

    #
    # Parse document
    #
    # @return [RelatonW3c::W3cBibliographicItem, nil] bibliographic item, or nil
    #   when the document type is not listed in the fetcher's USED_TYPES
    #
    def parse # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
      return unless @fetcher.class::USED_TYPES.include? type

      RelatonW3c::W3cBibliographicItem.new(
        type: "standard",
        doctype: parse_doctype,
        fetched: Date.today.to_s,
        language: ["en"],
        script: ["Latn"],
        title: parse_title,
        link: parse_link,
        docid: parse_docid,
        docnumber: identifier(@sol.link.to_s),
        series: parse_series,
        date: parse_date,
        relation: parse_relation,
        contributor: parse_contrib,
        editorialgroup: parse_editorialgroup,
      )
    end

    #
    # Parse title
    #
    # @return [RelatonBib::TypedTitleStringCollection] title
    #
    def parse_title
      t = RelatonBib::TypedTitleString.new content: @sol.title.to_s
      RelatonBib::TypedTitleStringCollection.new [t]
    end

    #
    # Parse link
    #
    # @return [Array<RelatonBib::TypedUri>] link
    #
    def parse_link
      [RelatonBib::TypedUri.new(type: "src", content: @sol.link.to_s)]
    end

    #
    # Parse docidentifier
    #
    # @return [Array<RelatonBib::DocumentIdentifier>] docidentifier
    #
    def parse_docid
      id = pub_id(@sol.link.to_s)
      [RelatonBib::DocumentIdentifier.new(type: "W3C", id: id)]
    end

    #
    # Generate PubID
    #
    # @param [String] url url
    #
    # @return [String] PubID
    #
    def pub_id(url)
      "W3C #{identifier(url)}"
    end

    #
    # Extract the document identifier from a URL: the last path segment made
    # of hyphen-separated parts, optionally followed by one more `/word`
    # segment.
    #
    # @param [String] url url
    #
    # @return [String] identifier, or an empty string when the URL does not
    #   contain a matching segment
    #
    def identifier(url)
      # String#[] with a capture index returns nil on no match, so `.to_s`
      # yields "" instead of raising NoMethodError on a nil MatchData.
      url[%r{.+/(\w+(?:-[\w.]+)+(?:/\w+)?)}, 1].to_s
    end

    #
    # Parse series
    #
    # @return [Array<RelatonBib::Series>] series
    #
    def parse_series
      title = RelatonBib::TypedTitleString.new content: "W3C #{type}"
      [RelatonBib::Series.new(title: title, number: identifier(@sol.link.to_s))]
    end

    #
    # Document type code. Queries the rdf:type values of the document, keeps
    # the part after the last "#" of each, and picks the first one that is a
    # key of Scrapper::DOCTYPES. The result is memoized; a nil result is not
    # cached, so the query runs again on the next call.
    #
    # @return [String, nil] type code or nil if no known type is found
    #
    def type # rubocop:disable Metrics/MethodLength
      @type ||= begin
        sse = SPARQL.parse(%(
          PREFIX : <http://www.w3.org/2001/02pd/rec54#>
          PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
          SELECT ?type
          WHERE {
            { <#{@sol.link}> rdf:type ?type }
          }
        ))
        tps = @fetcher.data.query(sse).map { |s| s.type.to_s.split("#").last }
        tps.find { |t| Scrapper::DOCTYPES.key?(t) }
      end
    end

    #
    # Parse doctype
    #
    # @return [String] doctype
    #
    def parse_doctype
      Scrapper::DOCTYPES[type]
    end

    #
    # Parse date
    #
    # @return [Array<RelatonBib::BibliographicDate>] single "published" date
    #   taken from the solution
    #
    def parse_date
      [RelatonBib::BibliographicDate.new(type: "published", on: @sol.date.to_s)]
    end

    #
    # Parse relation
    #
    # @return [Array<RelatonBib::DocumentRelation>] relation
    #
    def parse_relation # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
      sse = SPARQL.parse(%(
        PREFIX doc: <http://www.w3.org/2000/10/swap/pim/doc#>
        SELECT ?obsoletes
        WHERE {
          VALUES ?p { doc:obsoletes }
          { <#{@sol.link}> ?p ?obsoletes }
        }
      ))
      @fetcher.data.query(sse).order_by(:obsoletes).map do |r|
        # the first binding's name is used as the relation type and its value
        # is the URL of the related document
        tp, url = r.to_h.first
        fr = RelatonBib::LocalizedString.new pub_id(url.to_s)
        bib = W3cBibliographicItem.new formattedref: fr
        RelatonBib::DocumentRelation.new(type: tp.to_s, bibitem: bib)
      end
    end

    #
    # Parse contributor
    #
    # @return [Array<RelatonBib::ContributionInfo>] contributor
    #
    def parse_contrib # rubocop:disable Metrics/MethodLength
      sse = SPARQL.parse(%(
        PREFIX : <http://www.w3.org/2001/02pd/rec54#>
        PREFIX contact: <http://www.w3.org/2000/10/swap/pim/contact#>
        SELECT ?full_name
        WHERE {
          <#{@sol.link}> :editor/contact:fullName ?full_name
        }
      ))
      @fetcher.data.query(sse).order_by(:full_name).map do |ed|
        cn = RelatonBib::LocalizedString.new(ed.full_name.to_s, "en", "Latn")
        n = RelatonBib::FullName.new completename: cn
        person = RelatonBib::Person.new name: n
        RelatonBib::ContributionInfo.new entity: person, role: [type: "editor"]
      end
    end

    #
    # Parse editorialgroup
    #
    # @return [RelatonBib::EditorialGroup] editorialgroup
    #
    def parse_editorialgroup # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
      sse = SPARQL.parse(%(
        PREFIX org: <http://www.w3.org/2001/04/roadmap/org#>
        PREFIX contact: <http://www.w3.org/2000/10/swap/pim/contact#>
        SELECT ?home_page
        WHERE {
          <#{@sol.link}> org:deliveredBy/contact:homePage ?home_page
        }
      ))
      res = @fetcher.data.query(sse).order_by(:home_page)
      tc = res.each_with_object([]) do |edg, obj|
        # group names are looked up by home page URL without a trailing slash
        wg = @fetcher.group_names[edg.home_page.to_s.sub(/\/$/, "")]
        if wg
          rwg = RelatonBib::WorkGroup.new name: wg["name"]
          obj << RelatonBib::TechnicalCommittee.new(rwg)
        else
          warn "Working group name not found for #{edg.home_page}"
        end
      end
      RelatonBib::EditorialGroup.new tc
    end
  end
end
|
@@ -22,8 +22,8 @@ module RelatonW3c
|
|
22
22
|
# @param ref [String] reference to search
|
23
23
|
def initialize(ref)
|
24
24
|
%r{
|
25
|
-
^(W3C\s)?
|
26
|
-
(?<type>(CR|NOTE|PER|PR|REC|RET|WD|Candidate\sRecommendation|
|
25
|
+
^(?:W3C\s)?
|
26
|
+
(?<type>(?:CR|NOTE|PER|PR|REC|RET|WD|Candidate\sRecommendation|
|
27
27
|
Group\sNote|Proposed\sEdited\sRecommendation|Proposed\sRecommendation|
|
28
28
|
Recommendation|Retired|Working\sDraft))? # type
|
29
29
|
\s?
|
@@ -41,7 +41,7 @@ module RelatonW3c
|
|
41
41
|
# @param title_date [String]
|
42
42
|
# @param type [String]
|
43
43
|
# @return [Array<Hash>]
|
44
|
-
def from_yaml(title_date, type)
|
44
|
+
def from_yaml(title_date, type) # rubocop:disable Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/MethodLength,Metrics/PerceivedComplexity
|
45
45
|
/(?<title>.+)\s(?<date>\d{4}-\d{2}-\d{2})$/ =~ title_date
|
46
46
|
title ||= title_date
|
47
47
|
result = data.select do |hit|
|
@@ -50,7 +50,7 @@ module RelatonW3c
|
|
50
50
|
type_date_filter(hit, type, date)
|
51
51
|
end
|
52
52
|
if result.empty?
|
53
|
-
result = data.select { |h| h["link"].split("/").last.match?
|
53
|
+
result = data.select { |h| h["link"].split("/").last.match?(/#{title}/) }
|
54
54
|
end
|
55
55
|
result.map { |h| Hit.new(h, self) }
|
56
56
|
end
|
@@ -109,7 +109,7 @@ module RelatonW3c
|
|
109
109
|
# @param type [String]
|
110
110
|
# @return [String]
|
111
111
|
def short_type(type)
|
112
|
-
tp = TYPES.select { |
|
112
|
+
tp = TYPES.select { |_, v| v == type }.keys
|
113
113
|
tp.first || type
|
114
114
|
end
|
115
115
|
|
@@ -137,7 +137,7 @@ module RelatonW3c
|
|
137
137
|
# fetch data from server and save it to file.
|
138
138
|
#
|
139
139
|
def fetch_data
|
140
|
-
resp = Net::HTTP.get_response URI.parse(DOMAIN
|
140
|
+
resp = Net::HTTP.get_response URI.parse("#{DOMAIN}/TR/")
|
141
141
|
# return if there aren't any changes since last fetching
|
142
142
|
return unless resp.code == "200"
|
143
143
|
|
@@ -153,7 +153,7 @@ module RelatonW3c
|
|
153
153
|
# @param h_el [Nokogiri::XML::Element]
|
154
154
|
# @param link [Nokogiri::XML::Element]
|
155
155
|
# @param pubdetails [Nokogiri::XML::Element]
|
156
|
-
def fetch_hit(h_el, link, pubdetails)
|
156
|
+
def fetch_hit(h_el, link, pubdetails) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
|
157
157
|
datepub = pubdetails.at("text()").text.match(/\d{4}-\d{2}-\d{2}/).to_s
|
158
158
|
editor = h_el.xpath("ul[@class='editorlist']/li").map { |e| e.text.strip }
|
159
159
|
keyword = h_el.xpath("ul[@class='taglist']/li").map { |e| e.text.strip }
|
@@ -4,11 +4,12 @@ module RelatonW3c
|
|
4
4
|
class Processor < Relaton::Processor
|
5
5
|
attr_reader :idtype
|
6
6
|
|
7
|
-
def initialize
|
7
|
+
def initialize # rubocop:disable Lint/MissingSuper
|
8
8
|
@short = :relaton_w3c
|
9
9
|
@prefix = "W3C"
|
10
10
|
@defaultprefix = %r{^W3C\s}
|
11
11
|
@idtype = "W3C"
|
12
|
+
@datasets = %w[w3c-rdf]
|
12
13
|
end
|
13
14
|
|
14
15
|
# @param code [String]
|
@@ -19,6 +20,18 @@ module RelatonW3c
|
|
19
20
|
::RelatonW3c::W3cBibliography.get(code, date, opts)
|
20
21
|
end
|
21
22
|
|
23
|
+
#
|
24
|
+
# Fetch all the documents from http://www.w3.org/2002/01/tr-automation/tr.rdf
|
25
|
+
#
|
26
|
+
# @param [String] _source source name
|
27
|
+
# @param [Hash] opts
|
28
|
+
# @option opts [String] :output directory to output documents
|
29
|
+
# @option opts [String] :format
|
30
|
+
#
|
31
|
+
def fetch_data(_source, opts)
|
32
|
+
DataFetcher.fetch(**opts)
|
33
|
+
end
|
34
|
+
|
22
35
|
# @param xml [String]
|
23
36
|
# @return [RelatonCalconnect::CcBibliographicItem]
|
24
37
|
def from_xml(xml)
|
@@ -29,7 +42,7 @@ module RelatonW3c
|
|
29
42
|
# @return [RelatonW3c::W3cBibliographicItem]
|
30
43
|
def hash_to_bib(hash)
|
31
44
|
item_hash = ::RelatonW3c::HashConverter.hash_to_bib(hash)
|
32
|
-
::RelatonW3c::W3cBibliographicItem.new
|
45
|
+
::RelatonW3c::W3cBibliographicItem.new(**item_hash)
|
33
46
|
end
|
34
47
|
|
35
48
|
# Returns hash of XML grammar
|
data/lib/relaton_w3c/scrapper.rb
CHANGED
@@ -1,16 +1,16 @@
|
|
1
1
|
module RelatonW3c
|
2
2
|
class Scrapper
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
}.freeze
|
3
|
+
DOCTYPES = {
|
4
|
+
"CR" => "candidateRecommendation",
|
5
|
+
"NOTE" => "groupNote",
|
6
|
+
"PER" => "proposedEditedRecommendation",
|
7
|
+
"PR" => "proposedRecommendation",
|
8
|
+
"REC" => "recommendation",
|
9
|
+
"RET" => "retired",
|
10
|
+
"WD" => "workingDraft",
|
11
|
+
}.freeze
|
13
12
|
|
13
|
+
class << self
|
14
14
|
# @param hit [Hash]
|
15
15
|
# @return [RelatonW3c::W3cBibliographicItem]
|
16
16
|
def parse_page(hit) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
@@ -29,7 +29,7 @@ module RelatonW3c
|
|
29
29
|
doctype: fetch_doctype(hit, doc),
|
30
30
|
contributor: fetch_contributor(hit, doc),
|
31
31
|
relation: fetch_relation(doc),
|
32
|
-
keyword: hit["keyword"]
|
32
|
+
keyword: hit["keyword"],
|
33
33
|
)
|
34
34
|
end
|
35
35
|
|
@@ -53,7 +53,7 @@ module RelatonW3c
|
|
53
53
|
titles << { content: title.gsub(/\n/, " "), type: "main" }
|
54
54
|
end
|
55
55
|
subtitle = doc.at(
|
56
|
-
"//h2[@id='subtitle']|//p[contains(@class, 'subline')]"
|
56
|
+
"//h2[@id='subtitle']|//p[contains(@class, 'subline')]",
|
57
57
|
)&.text
|
58
58
|
titles << { content: subtitle, tipe: "subtitle" } if subtitle
|
59
59
|
end
|
@@ -62,7 +62,7 @@ module RelatonW3c
|
|
62
62
|
end
|
63
63
|
titles.map do |t|
|
64
64
|
title = RelatonBib::FormattedString.new(
|
65
|
-
content: t[:content], language: "en", script: "Latn"
|
65
|
+
content: t[:content], language: "en", script: "Latn",
|
66
66
|
)
|
67
67
|
RelatonBib::TypedTitleString.new(type: t[:type], title: title)
|
68
68
|
end
|
@@ -88,7 +88,7 @@ module RelatonW3c
|
|
88
88
|
# @param hit [Hash]
|
89
89
|
# @param doc [Nokogiri::HTML::Document, NilClass]
|
90
90
|
# @return [Array<RelatonBib::BibliographicDate>]
|
91
|
-
def fetch_date(hit, doc)
|
91
|
+
def fetch_date(hit, doc) # rubocop:disable Metrics/CyclomaticComplexity
|
92
92
|
on = hit["datepub"] || doc&.at("//h2/time[@datetime]")&.attr(:datetime)
|
93
93
|
on ||= fetch_date1(doc) || fetch_date2(doc)
|
94
94
|
[RelatonBib::BibliographicDate.new(type: "published", on: on)] if on
|
@@ -143,7 +143,7 @@ module RelatonW3c
|
|
143
143
|
end
|
144
144
|
mem
|
145
145
|
end
|
146
|
-
contribs.map { |c| contrib_info
|
146
|
+
contribs.map { |c| contrib_info(**c) }
|
147
147
|
else
|
148
148
|
hit["editor"].map do |ed|
|
149
149
|
contrib_info name: ed, role: [{ type: "editor" }]
|
@@ -162,7 +162,7 @@ module RelatonW3c
|
|
162
162
|
# @param element [Nokogiri::XML::Element]
|
163
163
|
# @param type [String]
|
164
164
|
# @return [Hash]
|
165
|
-
def parse_contrib(element, type)
|
165
|
+
def parse_contrib(element, type) # rubocop:disable Metrics/MethodLength
|
166
166
|
p = element.at("a")
|
167
167
|
return unless p
|
168
168
|
|
@@ -187,7 +187,7 @@ module RelatonW3c
|
|
187
187
|
name = RelatonBib::FullName.new completename: completename
|
188
188
|
af = []
|
189
189
|
if args[:org]
|
190
|
-
org = RelatonBib::Organization.new
|
190
|
+
org = RelatonBib::Organization.new(**args[:org])
|
191
191
|
af << RelatonBib::Affiliation.new(organization: org)
|
192
192
|
end
|
193
193
|
en = RelatonBib::Person.new name: name, url: args[:url], affiliation: af
|
data/lib/relaton_w3c/version.rb
CHANGED