relaton-w3c 1.7.1 → 1.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/rake.yml +1 -11
- data/.rubocop.yml +1 -1
- data/Gemfile +1 -1
- data/README.adoc +29 -0
- data/grammars/basicdoc.rng +165 -20
- data/grammars/biblio.rng +5 -6
- data/lib/relaton_w3c/data_fethcer.rb +106 -0
- data/lib/relaton_w3c/data_parser.rb +205 -0
- data/lib/relaton_w3c/hash_converter.rb +1 -1
- data/lib/relaton_w3c/hit_collection.rb +7 -7
- data/lib/relaton_w3c/processor.rb +15 -2
- data/lib/relaton_w3c/scrapper.rb +17 -17
- data/lib/relaton_w3c/version.rb +1 -1
- data/lib/relaton_w3c/w3c_bibliographic_item.rb +1 -1
- data/lib/relaton_w3c/workgroups.yaml +339 -0
- data/lib/relaton_w3c/xml_parser.rb +1 -1
- data/lib/relaton_w3c.rb +1 -0
- data/relaton_w3c.gemspec +6 -4
- metadata +65 -6
@@ -0,0 +1,205 @@
|
|
1
|
+
module RelatonW3c
  #
  # Converts one solution of the SPARQL query over the W3C RDF data into a
  # bibliographic item. Additional details (type, relations, editors,
  # working groups) are looked up with further SPARQL queries against the
  # fetcher's data, keyed by the solution's link.
  #
  class DataParser
    # Captures the identifier-shaped tail of a document URL: a word followed
    # by one or more "-part" groups, optionally followed by one more
    # "/word" path segment.
    IDENTIFIER_PATTERN = %r{.+/(\w+(?:-[\w.]+)+(?:/\w+)?)}.freeze

    #
    # Document parser initialization
    #
    # @param [RDF::Query::Solution] sol entry from the SPARQL query
    # @param [RelatonW3c::DataFetcher] fetcher data fetcher
    #
    def initialize(sol, fetcher)
      @sol = sol
      @fetcher = fetcher
    end

    #
    # Initialize document parser and run it
    #
    # @param [RDF::Query::Solution] sol entry from the SPARQL query
    # @param [RelatonW3c::DataFetcher] fetcher data fetcher
    #
    # @return [RelatonW3c::W3cBibliographicItem, nil] bibliographic item
    #
    def self.parse(sol, fetcher)
      new(sol, fetcher).parse
    end

    #
    # Parse document
    #
    # Returns nil when the document's type is not listed in the fetcher
    # class's USED_TYPES.
    #
    # @return [RelatonW3c::W3cBibliographicItem, nil] bibliographic item
    #
    def parse # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
      return unless @fetcher.class::USED_TYPES.include? type

      RelatonW3c::W3cBibliographicItem.new(
        type: "standard",
        doctype: parse_doctype,
        fetched: Date.today.to_s,
        language: ["en"],
        script: ["Latn"],
        title: parse_title,
        link: parse_link,
        docid: parse_docid,
        docnumber: identifier(@sol.link.to_s),
        series: parse_series,
        date: parse_date,
        relation: parse_relation,
        contributor: parse_contrib,
        editorialgroup: parse_editorialgroup,
      )
    end

    #
    # Parse title
    #
    # @return [RelatonBib::TypedTitleStringCollection] title
    #
    def parse_title
      t = RelatonBib::TypedTitleString.new title: @sol.title.to_s
      RelatonBib::TypedTitleStringCollection.new [t]
    end

    #
    # Parse link
    #
    # @return [Array<RelatonBib::TypedUri>] link
    #
    def parse_link
      [RelatonBib::TypedUri.new(type: "src", content: @sol.link.to_s)]
    end

    #
    # Parse docidentifier
    #
    # @return [Array<RelatonBib::DocumentIdentifier>] docidentifier
    #
    def parse_docid
      id = pub_id(@sol.link.to_s)
      [RelatonBib::DocumentIdentifier.new(type: "W3C", id: id)]
    end

    #
    # Generate PubID
    #
    # @param [String] url url
    #
    # @return [String] PubID
    #
    def pub_id(url)
      "W3C #{identifier(url)}"
    end

    #
    # Extract the document identifier from a URL
    #
    # @param [String] url url
    #
    # @return [String] the part of the URL captured by IDENTIFIER_PATTERN,
    #   or an empty string when the URL does not match
    #
    def identifier(url)
      # String#[] returns nil (not an exception) when nothing matches, so a
      # URL without an identifier-shaped segment yields "" instead of
      # raising NoMethodError on nil.
      url.to_s[IDENTIFIER_PATTERN, 1].to_s
    end

    #
    # Parse series
    #
    # @return [Array<RelatonBib::Series>] series
    #
    def parse_series
      title = RelatonBib::TypedTitleString.new content: "W3C #{type}"
      [RelatonBib::Series.new(title: title, number: identifier(@sol.link.to_s))]
    end

    #
    # Document type key
    #
    # Queries the rdf:type values of the document, keeps the part after the
    # last "#" of each, and picks the first one that is a key of
    # Scrapper::DOCTYPES. A non-nil result is memoized in @type.
    #
    # @return [String, nil] type key, nil when none of the types is known
    #
    def type # rubocop:disable Metrics/MethodLength
      @type ||= begin
        sse = SPARQL.parse(%(
          PREFIX : <http://www.w3.org/2001/02pd/rec54#>
          PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
          SELECT ?type
          WHERE {
            { <#{@sol.link}> rdf:type ?type }
          }
        ))
        tps = @fetcher.data.query(sse).map { |s| s.type.to_s.split("#").last }
        tps.detect { |t| Scrapper::DOCTYPES.key?(t) }
      end
    end

    #
    # Parse doctype
    #
    # @return [String] doctype
    #
    def parse_doctype
      Scrapper::DOCTYPES[type]
    end

    #
    # Parse date
    #
    # @return [Array<RelatonBib::BibliographicDate>] single "published" date
    #   taken from the solution's date
    #
    def parse_date
      [RelatonBib::BibliographicDate.new(type: "published", on: @sol.date.to_s)]
    end

    #
    # Parse relation
    #
    # @return [Array<RelatonBib::DocumentRelation>] relation
    #
    def parse_relation # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
      sse = SPARQL.parse(%(
        PREFIX doc: <http://www.w3.org/2000/10/swap/pim/doc#>
        SELECT ?obsoletes
        WHERE {
          VALUES ?p { doc:obsoletes }
          { <#{@sol.link}> ?p ?obsoletes }
        }
      ))
      @fetcher.data.query(sse).order_by(:obsoletes).map do |r|
        # The first pair of the solution hash supplies both the relation
        # type (the variable name) and the related document URL (its value).
        tp, url = r.to_h.first
        fr = RelatonBib::LocalizedString.new pub_id(url.to_s)
        bib = W3cBibliographicItem.new formattedref: fr
        RelatonBib::DocumentRelation.new(type: tp.to_s, bibitem: bib)
      end
    end

    #
    # Parse contributor
    #
    # @return [Array<RelatonBib::ContributionInfo>] contributor
    #
    def parse_contrib # rubocop:disable Metrics/MethodLength
      sse = SPARQL.parse(%(
        PREFIX : <http://www.w3.org/2001/02pd/rec54#>
        PREFIX contact: <http://www.w3.org/2000/10/swap/pim/contact#>
        SELECT ?full_name
        WHERE {
          <#{@sol.link}> :editor/contact:fullName ?full_name
        }
      ))
      @fetcher.data.query(sse).order_by(:full_name).map do |ed|
        cn = RelatonBib::LocalizedString.new(ed.full_name.to_s, "en", "Latn")
        n = RelatonBib::FullName.new completename: cn
        p = RelatonBib::Person.new name: n
        RelatonBib::ContributionInfo.new entity: p, role: [type: "editor"]
      end
    end

    #
    # Parse editorialgroup
    #
    # Home pages that are not found in the fetcher's group names are
    # reported with a warning and left out of the result.
    #
    # @return [RelatonBib::EditorialGroup] editorialgroup
    #
    def parse_editorialgroup # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
      sse = SPARQL.parse(%(
        PREFIX org: <http://www.w3.org/2001/04/roadmap/org#>
        PREFIX contact: <http://www.w3.org/2000/10/swap/pim/contact#>
        SELECT ?home_page
        WHERE {
          <#{@sol.link}> org:deliveredBy/contact:homePage ?home_page
        }
      ))
      res = @fetcher.data.query(sse).order_by(:home_page)
      tc = res.each_with_object([]) do |edg, obj|
        # Group names are looked up by home page URL without a trailing slash.
        wg = @fetcher.group_names[edg.home_page.to_s.sub(/\/$/, "")]
        if wg
          rwg = RelatonBib::WorkGroup.new name: wg["name"]
          obj << RelatonBib::TechnicalCommittee.new(rwg)
        else
          warn "Working group name not found for #{edg.home_page}"
        end
      end
      RelatonBib::EditorialGroup.new tc
    end
  end
end
|
@@ -22,8 +22,8 @@ module RelatonW3c
|
|
22
22
|
# @param ref [String] reference to search
|
23
23
|
def initialize(ref)
|
24
24
|
%r{
|
25
|
-
^(W3C\s)?
|
26
|
-
(?<type>(CR|NOTE|PER|PR|REC|RET|WD|Candidate\sRecommendation|
|
25
|
+
^(?:W3C\s)?
|
26
|
+
(?<type>(?:CR|NOTE|PER|PR|REC|RET|WD|Candidate\sRecommendation|
|
27
27
|
Group\sNote|Proposed\sEdited\sRecommendation|Proposed\sRecommendation|
|
28
28
|
Recommendation|Retired|Working\sDraft))? # type
|
29
29
|
\s?
|
@@ -41,7 +41,7 @@ module RelatonW3c
|
|
41
41
|
# @param title_date [String]
|
42
42
|
# @param type [String]
|
43
43
|
# @return [Array<Hash>]
|
44
|
-
def from_yaml(title_date, type)
|
44
|
+
def from_yaml(title_date, type) # rubocop:disable Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/MethodLength,Metrics/PerceivedComplexity
|
45
45
|
/(?<title>.+)\s(?<date>\d{4}-\d{2}-\d{2})$/ =~ title_date
|
46
46
|
title ||= title_date
|
47
47
|
result = data.select do |hit|
|
@@ -50,7 +50,7 @@ module RelatonW3c
|
|
50
50
|
type_date_filter(hit, type, date)
|
51
51
|
end
|
52
52
|
if result.empty?
|
53
|
-
result = data.select { |h| h["link"].split("/").last.match?
|
53
|
+
result = data.select { |h| h["link"].split("/").last.match?(/#{title}/) }
|
54
54
|
end
|
55
55
|
result.map { |h| Hit.new(h, self) }
|
56
56
|
end
|
@@ -109,7 +109,7 @@ module RelatonW3c
|
|
109
109
|
# @param type [String]
|
110
110
|
# @return [String]
|
111
111
|
def short_type(type)
|
112
|
-
tp = TYPES.select { |
|
112
|
+
tp = TYPES.select { |_, v| v == type }.keys
|
113
113
|
tp.first || type
|
114
114
|
end
|
115
115
|
|
@@ -137,7 +137,7 @@ module RelatonW3c
|
|
137
137
|
# fetch data from server and save it to file.
|
138
138
|
#
|
139
139
|
def fetch_data
|
140
|
-
resp = Net::HTTP.get_response URI.parse(DOMAIN
|
140
|
+
resp = Net::HTTP.get_response URI.parse("#{DOMAIN}/TR/")
|
141
141
|
# return if there aren't any changes since last fetching
|
142
142
|
return unless resp.code == "200"
|
143
143
|
|
@@ -153,7 +153,7 @@ module RelatonW3c
|
|
153
153
|
# @param h_el [Nokogiri::XML::Element]
|
154
154
|
# @param link [Nokogiri::XML::Element]
|
155
155
|
# @param pubdetails [Nokogiri::XML::Element]
|
156
|
-
def fetch_hit(h_el, link, pubdetails)
|
156
|
+
def fetch_hit(h_el, link, pubdetails) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
|
157
157
|
datepub = pubdetails.at("text()").text.match(/\d{4}-\d{2}-\d{2}/).to_s
|
158
158
|
editor = h_el.xpath("ul[@class='editorlist']/li").map { |e| e.text.strip }
|
159
159
|
keyword = h_el.xpath("ul[@class='taglist']/li").map { |e| e.text.strip }
|
@@ -4,11 +4,12 @@ module RelatonW3c
|
|
4
4
|
class Processor < Relaton::Processor
|
5
5
|
attr_reader :idtype
|
6
6
|
|
7
|
-
def initialize
|
7
|
+
def initialize # rubocop:disable Lint/MissingSuper
|
8
8
|
@short = :relaton_w3c
|
9
9
|
@prefix = "W3C"
|
10
10
|
@defaultprefix = %r{^W3C\s}
|
11
11
|
@idtype = "W3C"
|
12
|
+
@datasets = %w[w3c-rdf]
|
12
13
|
end
|
13
14
|
|
14
15
|
# @param code [String]
|
@@ -19,6 +20,18 @@ module RelatonW3c
|
|
19
20
|
::RelatonW3c::W3cBibliography.get(code, date, opts)
|
20
21
|
end
|
21
22
|
|
23
|
+
#
|
24
|
+
# Fetch all the documents from http://www.w3.org/2002/01/tr-automation/tr.rdf
|
25
|
+
#
|
26
|
+
# @param [String] _source source name
|
27
|
+
# @param [Hash] opts
|
28
|
+
# @option opts [String] :output directory to output documents
|
29
|
+
# @option opts [String] :format
|
30
|
+
#
|
31
|
+
def fetch_data(_source, opts)
|
32
|
+
DataFetcher.fetch(**opts)
|
33
|
+
end
|
34
|
+
|
22
35
|
# @param xml [String]
|
23
36
|
# @return [RelatonCalconnect::CcBibliographicItem]
|
24
37
|
def from_xml(xml)
|
@@ -29,7 +42,7 @@ module RelatonW3c
|
|
29
42
|
# @return [RelatonW3c::W3cBibliographicItem]
|
30
43
|
def hash_to_bib(hash)
|
31
44
|
item_hash = ::RelatonW3c::HashConverter.hash_to_bib(hash)
|
32
|
-
::RelatonW3c::W3cBibliographicItem.new
|
45
|
+
::RelatonW3c::W3cBibliographicItem.new(**item_hash)
|
33
46
|
end
|
34
47
|
|
35
48
|
# Returns hash of XML grammar
|
data/lib/relaton_w3c/scrapper.rb
CHANGED
@@ -1,16 +1,16 @@
|
|
1
1
|
module RelatonW3c
|
2
2
|
class Scrapper
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
}.freeze
|
3
|
+
DOCTYPES = {
|
4
|
+
"CR" => "candidateRecommendation",
|
5
|
+
"NOTE" => "groupNote",
|
6
|
+
"PER" => "proposedEditedRecommendation",
|
7
|
+
"PR" => "proposedRecommendation",
|
8
|
+
"REC" => "recommendation",
|
9
|
+
"RET" => "retired",
|
10
|
+
"WD" => "workingDraft",
|
11
|
+
}.freeze
|
13
12
|
|
13
|
+
class << self
|
14
14
|
# @param hit [Hash]
|
15
15
|
# @return [RelatonW3c::W3cBibliographicItem]
|
16
16
|
def parse_page(hit) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
@@ -29,7 +29,7 @@ module RelatonW3c
|
|
29
29
|
doctype: fetch_doctype(hit, doc),
|
30
30
|
contributor: fetch_contributor(hit, doc),
|
31
31
|
relation: fetch_relation(doc),
|
32
|
-
keyword: hit["keyword"]
|
32
|
+
keyword: hit["keyword"],
|
33
33
|
)
|
34
34
|
end
|
35
35
|
|
@@ -53,7 +53,7 @@ module RelatonW3c
|
|
53
53
|
titles << { content: title.gsub(/\n/, " "), type: "main" }
|
54
54
|
end
|
55
55
|
subtitle = doc.at(
|
56
|
-
"//h2[@id='subtitle']|//p[contains(@class, 'subline')]"
|
56
|
+
"//h2[@id='subtitle']|//p[contains(@class, 'subline')]",
|
57
57
|
)&.text
|
58
58
|
titles << { content: subtitle, tipe: "subtitle" } if subtitle
|
59
59
|
end
|
@@ -62,7 +62,7 @@ module RelatonW3c
|
|
62
62
|
end
|
63
63
|
titles.map do |t|
|
64
64
|
title = RelatonBib::FormattedString.new(
|
65
|
-
content: t[:content], language: "en", script: "Latn"
|
65
|
+
content: t[:content], language: "en", script: "Latn",
|
66
66
|
)
|
67
67
|
RelatonBib::TypedTitleString.new(type: t[:type], title: title)
|
68
68
|
end
|
@@ -88,7 +88,7 @@ module RelatonW3c
|
|
88
88
|
# @param hit [Hash]
|
89
89
|
# @param doc [Nokogiri::HTML::Document, NilClass]
|
90
90
|
# @return [Array<RelatonBib::BibliographicDate>]
|
91
|
-
def fetch_date(hit, doc)
|
91
|
+
def fetch_date(hit, doc) # rubocop:disable Metrics/CyclomaticComplexity
|
92
92
|
on = hit["datepub"] || doc&.at("//h2/time[@datetime]")&.attr(:datetime)
|
93
93
|
on ||= fetch_date1(doc) || fetch_date2(doc)
|
94
94
|
[RelatonBib::BibliographicDate.new(type: "published", on: on)] if on
|
@@ -143,7 +143,7 @@ module RelatonW3c
|
|
143
143
|
end
|
144
144
|
mem
|
145
145
|
end
|
146
|
-
contribs.map { |c| contrib_info
|
146
|
+
contribs.map { |c| contrib_info(**c) }
|
147
147
|
else
|
148
148
|
hit["editor"].map do |ed|
|
149
149
|
contrib_info name: ed, role: [{ type: "editor" }]
|
@@ -162,7 +162,7 @@ module RelatonW3c
|
|
162
162
|
# @param element [Nokogiri::XML::Element]
|
163
163
|
# @param type [String]
|
164
164
|
# @return [Hash]
|
165
|
-
def parse_contrib(element, type)
|
165
|
+
def parse_contrib(element, type) # rubocop:disable Metrics/MethodLength
|
166
166
|
p = element.at("a")
|
167
167
|
return unless p
|
168
168
|
|
@@ -187,7 +187,7 @@ module RelatonW3c
|
|
187
187
|
name = RelatonBib::FullName.new completename: completename
|
188
188
|
af = []
|
189
189
|
if args[:org]
|
190
|
-
org = RelatonBib::Organization.new
|
190
|
+
org = RelatonBib::Organization.new(**args[:org])
|
191
191
|
af << RelatonBib::Affiliation.new(organization: org)
|
192
192
|
end
|
193
193
|
en = RelatonBib::Person.new name: name, url: args[:url], affiliation: af
|
data/lib/relaton_w3c/version.rb
CHANGED