relaton-w3c 1.11.3 → 1.11.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/relaton_w3c/data_fetcher.rb +0 -53
- data/lib/relaton_w3c/data_index.rb +49 -44
- data/lib/relaton_w3c/data_parser.rb +1 -1
- data/lib/relaton_w3c/version.rb +1 -1
- data/lib/relaton_w3c/w3c_bibliography.rb +3 -3
- data/lib/relaton_w3c.rb +0 -3
- metadata +2 -5
- data/lib/relaton_w3c/hit.rb +0 -15
- data/lib/relaton_w3c/hit_collection.rb +0 -172
- data/lib/relaton_w3c/scrapper.rb +0 -218
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2c318668fd5a7ef93b5275ac02cfadc9b91832eddeccf5996bc5ea65fc5272b1
|
4
|
+
data.tar.gz: 431ee27aec817b6d352e5410e2f4bf63710ecfcff47813e03d7e3302a7b42ecd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4254d592bcc1469a7a8773d641e192d07be6d4b0ce247e2b93be490717fc20dc4457718604a5353cf5b0cef6231f2920b03288b776f186df5a8645c5f453d5ed
|
7
|
+
data.tar.gz: 0c5b07d1efb4f1df3c1505c0b8d2558e7c9bd12d80d1fe7ea483e313f4462a1a1ae8010587f18772dc0288ab0e9573724a9c671d9985427adce5307354f20ee9
|
data/lib/relaton_w3c/data_fetcher.rb
CHANGED
@@ -67,59 +67,6 @@ module RelatonW3c
|
|
67
67
|
@index.sort!.save
|
68
68
|
end
|
69
69
|
|
70
|
-
#
|
71
|
-
# Create index file
|
72
|
-
#
|
73
|
-
# def create_index
|
74
|
-
# index_file = "index-w3c.yaml"
|
75
|
-
# index_yaml = @index.sort do |a, b|
|
76
|
-
# compare_index_items a, b
|
77
|
-
# end.to_yaml
|
78
|
-
# File.write index_file, index_yaml, encoding: "UTF-8"
|
79
|
-
# end
|
80
|
-
|
81
|
-
#
|
82
|
-
# Compare index items
|
83
|
-
#
|
84
|
-
# @param [Hash] aid first item
|
85
|
-
# @param [Hash] bid second item
|
86
|
-
#
|
87
|
-
# @return [Integer] comparison result
|
88
|
-
#
|
89
|
-
# def compare_index_items(aid, bid) # rubocop:disable Metrics/AbcSize
|
90
|
-
# ret = aid[:code] <=> bid[:code]
|
91
|
-
# ret = stage_weight(bid[:stage]) <=> stage_weight(aid[:stage]) if ret.zero?
|
92
|
-
# ret = date_weight(bid[:date]) <=> date_weight(aid[:date]) if ret.zero?
|
93
|
-
# # ret = aid[:type] <=> bid[:type] if ret.zero?
|
94
|
-
# ret
|
95
|
-
# end
|
96
|
-
|
97
|
-
#
|
98
|
-
# Weight of stage
|
99
|
-
#
|
100
|
-
# @param [String, nil] stage stage
|
101
|
-
#
|
102
|
-
# @return [Integer] weight
|
103
|
-
#
|
104
|
-
# def stage_weight(stage)
|
105
|
-
# return DataParser::STAGES.size if stage.nil?
|
106
|
-
|
107
|
-
# DataParser::STAGES.keys.index(stage)
|
108
|
-
# end
|
109
|
-
|
110
|
-
#
|
111
|
-
# Weight of date
|
112
|
-
#
|
113
|
-
# @param [String] date date
|
114
|
-
#
|
115
|
-
# @return [String] weight
|
116
|
-
#
|
117
|
-
# def date_weight(date)
|
118
|
-
# return "99999999" if date.nil?
|
119
|
-
|
120
|
-
# date
|
121
|
-
# end
|
122
|
-
|
123
70
|
#
|
124
71
|
# Query RDF source for documents
|
125
72
|
#
|
data/lib/relaton_w3c/data_index.rb
CHANGED
@@ -13,25 +13,6 @@ module RelatonW3c
|
|
13
13
|
@index = index
|
14
14
|
end
|
15
15
|
|
16
|
-
#
|
17
|
-
# Create index from a GitHub repository
|
18
|
-
#
|
19
|
-
# @return [RelatonW3c::DataIndex] data index
|
20
|
-
#
|
21
|
-
def self.create_from_repo # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
|
22
|
-
resp = Zip::InputStream.new URI("#{W3cBibliography::SOURCE}index-w3c.zip").open
|
23
|
-
zip = resp.get_next_entry
|
24
|
-
|
25
|
-
# Newer versions of Psych uses the `permitted_classes:` parameter
|
26
|
-
index = if YAML.method(:safe_load).parameters.collect(&:last).index(:permitted_classes)
|
27
|
-
YAML.safe_load(zip.get_input_stream.read, permitted_classes: [Symbol])
|
28
|
-
else
|
29
|
-
YAML.safe_load(zip.get_input_stream.read, [Symbol])
|
30
|
-
end
|
31
|
-
|
32
|
-
DataIndex.new index: index
|
33
|
-
end
|
34
|
-
|
35
16
|
#
|
36
17
|
# Add document to index
|
37
18
|
#
|
@@ -39,7 +20,9 @@ module RelatonW3c
|
|
39
20
|
# @param [String] file path to document file
|
40
21
|
#
|
41
22
|
def add(docnumber, file)
|
42
|
-
|
23
|
+
dnparts = self.class.docnumber_to_parts docnumber
|
24
|
+
dnparts[:file] = file
|
25
|
+
@index << dnparts
|
43
26
|
end
|
44
27
|
|
45
28
|
#
|
@@ -67,11 +50,14 @@ module RelatonW3c
|
|
67
50
|
# @return [String] document's filename
|
68
51
|
#
|
69
52
|
def search(ref) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
70
|
-
dparts = docnumber_to_parts(ref)
|
53
|
+
dparts = self.class.docnumber_to_parts(ref)
|
54
|
+
return if dparts[:code].nil?
|
55
|
+
|
71
56
|
@index.detect do |parts|
|
72
57
|
parts[:code].match?(/^#{Regexp.escape dparts[:code]}/i) &&
|
73
58
|
(dparts[:stage].nil? || dparts[:stage].casecmp?(parts[:stage])) &&
|
74
|
-
(dparts[:type].nil? || dparts[:type].casecmp?(parts[:type])
|
59
|
+
(dparts[:type].nil? || dparts[:type].casecmp?(parts[:type]) ||
|
60
|
+
(parts[:type].nil? && dparts[:type] == "TR")) &&
|
75
61
|
(dparts[:date].nil? || dparts[:date] == parts[:date]) &&
|
76
62
|
(dparts[:suff].nil? || dparts[:suff].casecmp?(parts[:suff]))
|
77
63
|
end&.fetch(:file)
|
@@ -119,28 +105,47 @@ module RelatonW3c
|
|
119
105
|
date
|
120
106
|
end
|
121
107
|
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
(
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
108
|
+
class << self
|
109
|
+
#
|
110
|
+
# Create index from a GitHub repository
|
111
|
+
#
|
112
|
+
# @return [RelatonW3c::DataIndex] data index
|
113
|
+
#
|
114
|
+
def create_from_repo # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
|
115
|
+
resp = Zip::InputStream.new URI("#{W3cBibliography::SOURCE}index-w3c.zip").open
|
116
|
+
zip = resp.get_next_entry
|
117
|
+
|
118
|
+
# Newer versions of Psych uses the `permitted_classes:` parameter
|
119
|
+
index = if YAML.method(:safe_load).parameters.collect(&:last).index(:permitted_classes)
|
120
|
+
YAML.safe_load(zip.get_input_stream.read, permitted_classes: [Symbol])
|
121
|
+
else
|
122
|
+
YAML.safe_load(zip.get_input_stream.read, [Symbol])
|
123
|
+
end
|
124
|
+
|
125
|
+
DataIndex.new index: index
|
126
|
+
end
|
127
|
+
|
128
|
+
#
|
129
|
+
# Parse document number to parts
|
130
|
+
#
|
131
|
+
# @param [String] docnumber document number
|
132
|
+
#
|
133
|
+
# @return [Hash{Symbol=>String}] document parts
|
134
|
+
#
|
135
|
+
def docnumber_to_parts(docnumber) # rubocop:disable Metrics/MethodLength
|
136
|
+
%r{
|
137
|
+
^(?:(?:(?<stage>WD|CRD|CR|PR|PER|REC|SPSD|OBSL|RET)|(?<type>D?NOTE|TR))-)?
|
138
|
+
(?<code>\w+(?:[+-][\w.]+)*?)
|
139
|
+
(?:-(?<date>\d{8}|\d{6}))?
|
140
|
+
(?:/(?<suff>\w+))?$
|
141
|
+
}xi =~ docnumber
|
142
|
+
entry = { code: code }
|
143
|
+
entry[:stage] = stage if stage
|
144
|
+
entry[:type] = type if type
|
145
|
+
entry[:date] = date if date
|
146
|
+
entry[:suff] = suff if suff
|
147
|
+
entry
|
148
|
+
end
|
144
149
|
end
|
145
150
|
end
|
146
151
|
end
|
data/lib/relaton_w3c/data_parser.rb
CHANGED
@@ -169,7 +169,7 @@ module RelatonW3c
|
|
169
169
|
#
|
170
170
|
def type
|
171
171
|
# thre are many types, we need to find the right one
|
172
|
-
@type ||= types_stages&.detect { |t| USED_TYPES.include?(t) }
|
172
|
+
@type ||= types_stages&.detect { |t| USED_TYPES.include?(t) } || "technicalReport"
|
173
173
|
end
|
174
174
|
|
175
175
|
#
|
data/lib/relaton_w3c/version.rb
CHANGED
data/lib/relaton_w3c/w3c_bibliography.rb
CHANGED
@@ -9,10 +9,10 @@ module RelatonW3c
|
|
9
9
|
|
10
10
|
class << self
|
11
11
|
# @param text [String]
|
12
|
-
# @return [RelatonW3c::
|
13
|
-
def search(text) # rubocop:disable Metrics/MethodLength
|
12
|
+
# @return [RelatonW3c::W3cBibliographicItem]
|
13
|
+
def search(text) # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
|
14
14
|
ref = DataParser.parse_identifier text.sub(/^W3C\s/, "")
|
15
|
-
file = DataIndex.create_from_repo.search(
|
15
|
+
file = DataIndex.create_from_repo.search ref.gsub(" ", "-").squeeze("-")
|
16
16
|
return unless file
|
17
17
|
|
18
18
|
url = "#{SOURCE}#{file}"
|
data/lib/relaton_w3c.rb
CHANGED
@@ -2,9 +2,6 @@ require "relaton_bib"
|
|
2
2
|
require "relaton_w3c/version"
|
3
3
|
require "relaton_w3c/w3c_bibliography"
|
4
4
|
require "relaton_w3c/w3c_bibliographic_item"
|
5
|
-
# require "relaton_w3c/hit_collection"
|
6
|
-
# require "relaton_w3c/hit"
|
7
|
-
# require "relaton_w3c/scrapper"
|
8
5
|
require "relaton_w3c/xml_parser"
|
9
6
|
require "relaton_w3c/bibxml_parser"
|
10
7
|
require "relaton_w3c/hash_converter"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: relaton-w3c
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.11.3
|
4
|
+
version: 1.11.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-04-
|
11
|
+
date: 2022-04-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: equivalent-xml
|
@@ -247,10 +247,7 @@ files:
|
|
247
247
|
- lib/relaton_w3c/data_index.rb
|
248
248
|
- lib/relaton_w3c/data_parser.rb
|
249
249
|
- lib/relaton_w3c/hash_converter.rb
|
250
|
-
- lib/relaton_w3c/hit.rb
|
251
|
-
- lib/relaton_w3c/hit_collection.rb
|
252
250
|
- lib/relaton_w3c/processor.rb
|
253
|
-
- lib/relaton_w3c/scrapper.rb
|
254
251
|
- lib/relaton_w3c/version.rb
|
255
252
|
- lib/relaton_w3c/w3c_bibliographic_item.rb
|
256
253
|
- lib/relaton_w3c/w3c_bibliography.rb
|
data/lib/relaton_w3c/hit.rb
DELETED
@@ -1,15 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module RelatonW3c
|
4
|
-
# Hit.
|
5
|
-
class Hit < RelatonBib::Hit
|
6
|
-
#
|
7
|
-
# Parse page.
|
8
|
-
#
|
9
|
-
# @param lang [String, NilClass]
|
10
|
-
# @return [RelatonW3c::W3cBibliographicItem]
|
11
|
-
def fetch(_lang = nil)
|
12
|
-
@fetch ||= Scrapper.parse_page hit
|
13
|
-
end
|
14
|
-
end
|
15
|
-
end
|
data/lib/relaton_w3c/hit_collection.rb
DELETED
@@ -1,172 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require "fileutils"
|
4
|
-
require "yaml"
|
5
|
-
|
6
|
-
module RelatonW3c
|
7
|
-
# Page of hit collection.
|
8
|
-
class HitCollection < RelatonBib::HitCollection
|
9
|
-
TYPES = {
|
10
|
-
"CR" => "Candidate Recommendation",
|
11
|
-
"NOTE" => "Group Note",
|
12
|
-
"PER" => "Proposed Edited Recommendation",
|
13
|
-
"PR" => "Proposed Recommendation",
|
14
|
-
"REC" => "Recommendation",
|
15
|
-
"RET" => "Retired",
|
16
|
-
"WD" => "Working Draft",
|
17
|
-
}.freeze
|
18
|
-
DOMAIN = "https://www.w3.org"
|
19
|
-
DATADIR = File.expand_path(".relaton/w3c", Dir.home).freeze
|
20
|
-
DATAFILE = File.expand_path("bibliography.yml", DATADIR).freeze
|
21
|
-
|
22
|
-
# @param ref [String] reference to search
|
23
|
-
def initialize(ref)
|
24
|
-
%r{
|
25
|
-
^(?:W3C\s)?
|
26
|
-
(?<type>(?:CR|NOTE|PER|PR|REC|RET|WD|Candidate\sRecommendation|
|
27
|
-
Group\sNote|Proposed\sEdited\sRecommendation|Proposed\sRecommendation|
|
28
|
-
Recommendation|Retired|Working\sDraft))? # type
|
29
|
-
\s?
|
30
|
-
(?<title_date>.+) # title_date
|
31
|
-
}x =~ ref
|
32
|
-
super
|
33
|
-
@array = from_yaml title_date, type
|
34
|
-
end
|
35
|
-
|
36
|
-
private
|
37
|
-
|
38
|
-
#
|
39
|
-
# Fetch data form yaml
|
40
|
-
#
|
41
|
-
# @param title_date [String]
|
42
|
-
# @param type [String]
|
43
|
-
# @return [Array<Hash>]
|
44
|
-
def from_yaml(title_date, type) # rubocop:disable Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/MethodLength,Metrics/PerceivedComplexity
|
45
|
-
/(?<title>.+)\s(?<date>\d{4}-\d{2}-\d{2})$/ =~ title_date
|
46
|
-
title ||= title_date
|
47
|
-
result = data.select do |hit|
|
48
|
-
(hit["title"].casecmp?(title) ||
|
49
|
-
hit["link"].split("/").last.match?(/-#{title}-/)) &&
|
50
|
-
type_date_filter(hit, type, date)
|
51
|
-
end
|
52
|
-
if result.empty?
|
53
|
-
result = data.select { |h| h["link"].split("/").last.match?(/#{title}/) }
|
54
|
-
end
|
55
|
-
result.map { |h| Hit.new(h, self) }
|
56
|
-
end
|
57
|
-
|
58
|
-
# @param hit [Hash]
|
59
|
-
# @param type [String]
|
60
|
-
# @param date [String]
|
61
|
-
# @return [TrueClass, FalseClass]
|
62
|
-
def type_date_filter(hit, type, date) # rubocop:disable Metrics/AbcSize
|
63
|
-
if (type && hit["type"] != short_type(type)) || (date && hit["date"] != date)
|
64
|
-
history = get_history hit, type, date
|
65
|
-
return false unless history.any?
|
66
|
-
|
67
|
-
hit["type"] = short_type type
|
68
|
-
hit["datepub"] = history.first.at("td").text
|
69
|
-
hit["link"] = history.first.at("a")[:href]
|
70
|
-
end
|
71
|
-
true
|
72
|
-
end
|
73
|
-
|
74
|
-
# @param hit [Hash]
|
75
|
-
# @param type [String]
|
76
|
-
# @param date [String]
|
77
|
-
# @return [Array<Nokogiri::XML::Element>, Nokogiri::HTML::NodeSet]
|
78
|
-
def get_history(hit, type, date)
|
79
|
-
resp = Net::HTTP.get URI.parse(HitCollection::DOMAIN + hit["history"])
|
80
|
-
history_doc = Nokogiri::HTML resp
|
81
|
-
history = history_doc.xpath(
|
82
|
-
"//table//a[contains(.,'#{long_type(type)}')]/../..",
|
83
|
-
)
|
84
|
-
return filter_history_by_date(history, history_doc, type, date) if date
|
85
|
-
|
86
|
-
history
|
87
|
-
end
|
88
|
-
|
89
|
-
# @param history [Nokogiri::XML::NodeSet]
|
90
|
-
# @param history_doc [Nokogiri::HTML::NodeSet]
|
91
|
-
# @param type [String]
|
92
|
-
# @param date [String]
|
93
|
-
# @return [Array<Nokogiri::XML::Element>, Nokogiri::HTML::NodeSet]
|
94
|
-
def filter_history_by_date(history, history_doc, type, date)
|
95
|
-
if type
|
96
|
-
history.select do |h|
|
97
|
-
h.at("td[@class='table_datecol']").text == date
|
98
|
-
end
|
99
|
-
else
|
100
|
-
history_doc.xpath(
|
101
|
-
"//table//td[@class='table_datecol'][.='#{date}']/..",
|
102
|
-
)
|
103
|
-
end
|
104
|
-
end
|
105
|
-
|
106
|
-
#
|
107
|
-
# Convetr long type name to short
|
108
|
-
#
|
109
|
-
# @param type [String]
|
110
|
-
# @return [String]
|
111
|
-
def short_type(type)
|
112
|
-
tp = TYPES.select { |_, v| v == type }.keys
|
113
|
-
tp.first || type
|
114
|
-
end
|
115
|
-
|
116
|
-
#
|
117
|
-
# Convert shot type name to long
|
118
|
-
#
|
119
|
-
# @param [String]
|
120
|
-
# @return [String]
|
121
|
-
def long_type(type)
|
122
|
-
TYPES[type] || type
|
123
|
-
end
|
124
|
-
|
125
|
-
#
|
126
|
-
# Fetches YAML data
|
127
|
-
#
|
128
|
-
# @return [Hash]
|
129
|
-
def data
|
130
|
-
FileUtils.mkdir_p DATADIR
|
131
|
-
ctime = File.ctime DATAFILE if File.exist? DATAFILE
|
132
|
-
fetch_data if !ctime || ctime.to_date < Date.today
|
133
|
-
@data ||= YAML.safe_load File.read(DATAFILE, encoding: "UTF-8")
|
134
|
-
end
|
135
|
-
|
136
|
-
#
|
137
|
-
# fetch data form server and save it to file.
|
138
|
-
#
|
139
|
-
def fetch_data
|
140
|
-
resp = Net::HTTP.get_response URI.parse("#{DOMAIN}/TR/")
|
141
|
-
# return if there aren't any changes since last fetching
|
142
|
-
return unless resp.code == "200"
|
143
|
-
|
144
|
-
doc = Nokogiri::HTML resp.body
|
145
|
-
@data = doc.xpath("//ul[@id='container']/li").map do |h_el|
|
146
|
-
link = h_el.at("h2/a")
|
147
|
-
pubdetails = h_el.at("p[@class='pubdetails']")
|
148
|
-
fetch_hit h_el, link, pubdetails
|
149
|
-
end
|
150
|
-
File.write DATAFILE, @data.to_yaml, encoding: "UTF-8"
|
151
|
-
end
|
152
|
-
|
153
|
-
# @param h_el [Nokogiri::XML::Element]
|
154
|
-
# @param link [Nokogiri::XML::Element]
|
155
|
-
# @param pubdetails [Nokogiri::XML::Element]
|
156
|
-
def fetch_hit(h_el, link, pubdetails) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
|
157
|
-
datepub = pubdetails.at("text()").text.match(/\d{4}-\d{2}-\d{2}/).to_s
|
158
|
-
editor = h_el.xpath("ul[@class='editorlist']/li").map { |e| e.text.strip }
|
159
|
-
keyword = h_el.xpath("ul[@class='taglist']/li").map { |e| e.text.strip }
|
160
|
-
{
|
161
|
-
"title" => link.text.gsub("\u00a0", " "),
|
162
|
-
"link" => link[:href],
|
163
|
-
"type" => h_el.at("div").text.upcase,
|
164
|
-
"workgroup" => h_el.xpath("p[@class='deliverer']").map(&:text),
|
165
|
-
"datepub" => datepub,
|
166
|
-
"history" => pubdetails.at("a[text()='History']")[:href],
|
167
|
-
"editor" => editor,
|
168
|
-
"keyword" => keyword,
|
169
|
-
}
|
170
|
-
end
|
171
|
-
end
|
172
|
-
end
|
data/lib/relaton_w3c/scrapper.rb
DELETED
@@ -1,218 +0,0 @@
|
|
1
|
-
module RelatonW3c
|
2
|
-
class Scrapper
|
3
|
-
DOCTYPES = {
|
4
|
-
"CR" => "candidateRecommendation",
|
5
|
-
"NOTE" => "groupNote",
|
6
|
-
"PER" => "proposedEditedRecommendation",
|
7
|
-
"PR" => "proposedRecommendation",
|
8
|
-
"REC" => "recommendation",
|
9
|
-
"RET" => "retired",
|
10
|
-
"WD" => "workingDraft",
|
11
|
-
}.freeze
|
12
|
-
|
13
|
-
class << self
|
14
|
-
# @param hit [Hash]
|
15
|
-
# @return [RelatonW3c::W3cBibliographicItem]
|
16
|
-
def parse_page(hit) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
17
|
-
resp = Net::HTTP.get_response URI.parse(hit["link"])
|
18
|
-
doc = resp.code == "200" ? Nokogiri::HTML(resp.body) : nil
|
19
|
-
W3cBibliographicItem.new(
|
20
|
-
type: "standard",
|
21
|
-
docid: fetch_docid(hit),
|
22
|
-
fetched: Date.today.to_s,
|
23
|
-
language: ["en"],
|
24
|
-
script: ["Latn"],
|
25
|
-
title: fetch_title(hit, doc),
|
26
|
-
abstract: fetch_abstract(doc),
|
27
|
-
link: fetch_link(hit),
|
28
|
-
date: fetch_date(hit, doc),
|
29
|
-
doctype: fetch_doctype(hit, doc),
|
30
|
-
contributor: fetch_contributor(hit, doc),
|
31
|
-
relation: fetch_relation(doc),
|
32
|
-
keyword: hit["keyword"],
|
33
|
-
)
|
34
|
-
end
|
35
|
-
|
36
|
-
private
|
37
|
-
|
38
|
-
# @param hit [Hash]
|
39
|
-
# @return [Array<RelatonBib::DocumentIdentifier>]
|
40
|
-
def fetch_docid(hit)
|
41
|
-
id = hit["link"].split("/").last
|
42
|
-
[RelatonBib::DocumentIdentifier.new(id: id, type: "W3C", primary: true)]
|
43
|
-
end
|
44
|
-
|
45
|
-
# @param hit [Hash]
|
46
|
-
# @param doc [Nokogiri::HTML::Document]
|
47
|
-
# @return [Array<RelatonBib::TypedTitleString>]
|
48
|
-
def fetch_title(hit, doc) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
|
49
|
-
titles = []
|
50
|
-
if doc
|
51
|
-
title = doc.at("//*[contains(@id, 'title')]")&.text
|
52
|
-
if title && !title.empty?
|
53
|
-
titles << { content: title.gsub(/\n/, " "), type: "main" }
|
54
|
-
end
|
55
|
-
subtitle = doc.at(
|
56
|
-
"//h2[@id='subtitle']|//p[contains(@class, 'subline')]",
|
57
|
-
)&.text
|
58
|
-
titles << { content: subtitle, tipe: "subtitle" } if subtitle
|
59
|
-
end
|
60
|
-
if titles.empty? && hit["title"]
|
61
|
-
titles << { content: hit["title"], type: "main" }
|
62
|
-
end
|
63
|
-
titles.map do |t|
|
64
|
-
title = RelatonBib::FormattedString.new(
|
65
|
-
content: t[:content], language: "en", script: "Latn",
|
66
|
-
)
|
67
|
-
RelatonBib::TypedTitleString.new(type: t[:type], title: title)
|
68
|
-
end
|
69
|
-
end
|
70
|
-
|
71
|
-
# @param doc [Nokogiri::HTML::Document, NilClass]
|
72
|
-
# @return [Array<RelatonBib::FormattedString>]
|
73
|
-
def fetch_abstract(doc)
|
74
|
-
return [] unless doc
|
75
|
-
|
76
|
-
content = doc.at("//h2[.='Abstract']/following-sibling::p",
|
77
|
-
"//div[@class='abstract']/p").text
|
78
|
-
[RelatonBib::FormattedString.new(content: content, language: "en",
|
79
|
-
script: "Latn")]
|
80
|
-
end
|
81
|
-
|
82
|
-
# @param hit [Hash]
|
83
|
-
# @return [Array<RelatonBib::TypedUri>]
|
84
|
-
def fetch_link(hit)
|
85
|
-
[RelatonBib::TypedUri.new(type: "src", content: hit["link"])]
|
86
|
-
end
|
87
|
-
|
88
|
-
# @param hit [Hash]
|
89
|
-
# @param doc [Nokogiri::HTML::Document, NilClass]
|
90
|
-
# @return [Array<RelatonBib::BibliographicDate>]
|
91
|
-
def fetch_date(hit, doc) # rubocop:disable Metrics/CyclomaticComplexity
|
92
|
-
on = hit["datepub"] || doc&.at("//h2/time[@datetime]")&.attr(:datetime)
|
93
|
-
on ||= fetch_date1(doc) || fetch_date2(doc)
|
94
|
-
[RelatonBib::BibliographicDate.new(type: "published", on: on)] if on
|
95
|
-
end
|
96
|
-
|
97
|
-
# @param doc [Nokogiri::HTML::Document, NilClass]
|
98
|
-
# @return [String]
|
99
|
-
def fetch_date1(doc)
|
100
|
-
d = doc&.at("//h2[@property='dc:issued']")&.attr(:content)
|
101
|
-
d&.match(/\d{4}-\d{2}-\d{2}/)&.to_s
|
102
|
-
end
|
103
|
-
|
104
|
-
# @param doc [Nokogiri::HTML::Document, NilClass]
|
105
|
-
# @return [String]
|
106
|
-
def fetch_date2(doc)
|
107
|
-
d = doc&.at("//h2[contains(@id, 'w3c-recommendation')]")
|
108
|
-
return unless d
|
109
|
-
|
110
|
-
Date.parse(d.attr(:id.match(/\d{2}-\w+-\d{4}/).to_s)).to_s
|
111
|
-
end
|
112
|
-
|
113
|
-
# @param hit [Hash]
|
114
|
-
# @param doc [Nokogiri::HTML::Document, NilClass]
|
115
|
-
# @return [String]
|
116
|
-
def fetch_doctype(hit, doc)
|
117
|
-
if hit["type"]
|
118
|
-
DOCTYPES[hit["type"]]
|
119
|
-
elsif doc
|
120
|
-
type = HitCollection::TYPES.detect do |_k, v|
|
121
|
-
doc.at("//h2[contains(., '#{v}')]/time[@datetime]")
|
122
|
-
end
|
123
|
-
DOCTYPES[type&.first]
|
124
|
-
end
|
125
|
-
end
|
126
|
-
|
127
|
-
# @param hit [Hash]
|
128
|
-
# @param doc [Nokogiri::HTML::Document, NilClass]
|
129
|
-
# @return [Array<RelatonBib::ContributionInfo>]
|
130
|
-
def fetch_contributor(hit, doc) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
|
131
|
-
if doc
|
132
|
-
editors = find_contribs(doc, "Editors").reduce([]) do |mem, ed|
|
133
|
-
c = parse_contrib ed, "editor"
|
134
|
-
mem << c if c
|
135
|
-
mem
|
136
|
-
end
|
137
|
-
contribs = find_contribs(doc, "Authors").reduce(editors) do |mem, ath|
|
138
|
-
ed = mem.detect { |e| e[:id] && e[:id] == ath["data-editor-id"] }
|
139
|
-
if ed
|
140
|
-
ed[:role] << { type: "author" }
|
141
|
-
else
|
142
|
-
mem << parse_contrib(ath, "author")
|
143
|
-
end
|
144
|
-
mem
|
145
|
-
end
|
146
|
-
contribs.map { |c| contrib_info(**c) }
|
147
|
-
else
|
148
|
-
hit["editor"].map do |ed|
|
149
|
-
contrib_info name: ed, role: [{ type: "editor" }]
|
150
|
-
end
|
151
|
-
end
|
152
|
-
end
|
153
|
-
|
154
|
-
# @param doc [Nokogiri::NTML::Document]
|
155
|
-
# @param type [String]
|
156
|
-
# @return [Array<Nokogiri::XML::Element]
|
157
|
-
def find_contribs(doc, type)
|
158
|
-
doc.xpath("//dt[contains(.,'#{type}')]/following-sibling::dd"\
|
159
|
-
"[preceding-sibling::dt[1][contains(.,'#{type}')]]")
|
160
|
-
end
|
161
|
-
|
162
|
-
# @param element [Nokogiri::XML::Element]
|
163
|
-
# @param type [String]
|
164
|
-
# @return [Hash]
|
165
|
-
def parse_contrib(element, type) # rubocop:disable Metrics/MethodLength
|
166
|
-
p = element.at("a")
|
167
|
-
return unless p
|
168
|
-
|
169
|
-
contrib = {
|
170
|
-
name: p.text,
|
171
|
-
url: p[:href],
|
172
|
-
role: [{ type: type }],
|
173
|
-
id: element["data-editor-id"],
|
174
|
-
}
|
175
|
-
org = element.at("a[2]")
|
176
|
-
contrib[:org] = { name: org.text, url: org[:href] } if org
|
177
|
-
contrib
|
178
|
-
end
|
179
|
-
|
180
|
-
# @param name [String]
|
181
|
-
# @param url [String, NilClass]
|
182
|
-
# @param role [Array<Hash>]
|
183
|
-
# @parma org [Hash]
|
184
|
-
# @return [RelatonBib::ContributionInfo]
|
185
|
-
def contrib_info(**args)
|
186
|
-
completename = RelatonBib::LocalizedString.new(args[:name])
|
187
|
-
name = RelatonBib::FullName.new completename: completename
|
188
|
-
af = []
|
189
|
-
if args[:org]
|
190
|
-
org = RelatonBib::Organization.new(**args[:org])
|
191
|
-
af << RelatonBib::Affiliation.new(organization: org)
|
192
|
-
end
|
193
|
-
en = RelatonBib::Person.new name: name, url: args[:url], affiliation: af
|
194
|
-
RelatonBib::ContributionInfo.new entity: en, role: args[:role]
|
195
|
-
end
|
196
|
-
|
197
|
-
# @param doc [Nokogiri::HTML::Document]
|
198
|
-
# @return [Array<RelatonBib::DocumentRelation>]
|
199
|
-
def fetch_relation(doc)
|
200
|
-
return [] unless doc && (link = recommendation_link(doc))
|
201
|
-
|
202
|
-
hit = { "link" => link }
|
203
|
-
item = parse_page hit
|
204
|
-
[RelatonBib::DocumentRelation.new(type: "obsoletedBy", bibitem: item)]
|
205
|
-
end
|
206
|
-
|
207
|
-
# @param doc [Nokogiri::HTML::Document]
|
208
|
-
# @return [String, NilClass]
|
209
|
-
def recommendation_link(doc)
|
210
|
-
recom = doc.at("//dt[.='Latest Recommendation:']",
|
211
|
-
"//dt[.='Previous Recommendation:']")
|
212
|
-
return unless recom
|
213
|
-
|
214
|
-
recom.at("./following-sibling::dd/a")[:href]
|
215
|
-
end
|
216
|
-
end
|
217
|
-
end
|
218
|
-
end
|