relaton-w3c 1.11.3 → 1.11.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/relaton_w3c/data_fetcher.rb +0 -53
- data/lib/relaton_w3c/data_index.rb +49 -44
- data/lib/relaton_w3c/data_parser.rb +1 -1
- data/lib/relaton_w3c/version.rb +1 -1
- data/lib/relaton_w3c/w3c_bibliography.rb +3 -3
- data/lib/relaton_w3c.rb +0 -3
- metadata +2 -5
- data/lib/relaton_w3c/hit.rb +0 -15
- data/lib/relaton_w3c/hit_collection.rb +0 -172
- data/lib/relaton_w3c/scrapper.rb +0 -218
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2c318668fd5a7ef93b5275ac02cfadc9b91832eddeccf5996bc5ea65fc5272b1
|
4
|
+
data.tar.gz: 431ee27aec817b6d352e5410e2f4bf63710ecfcff47813e03d7e3302a7b42ecd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4254d592bcc1469a7a8773d641e192d07be6d4b0ce247e2b93be490717fc20dc4457718604a5353cf5b0cef6231f2920b03288b776f186df5a8645c5f453d5ed
|
7
|
+
data.tar.gz: 0c5b07d1efb4f1df3c1505c0b8d2558e7c9bd12d80d1fe7ea483e313f4462a1a1ae8010587f18772dc0288ab0e9573724a9c671d9985427adce5307354f20ee9
|
@@ -67,59 +67,6 @@ module RelatonW3c
|
|
67
67
|
@index.sort!.save
|
68
68
|
end
|
69
69
|
|
70
|
-
#
|
71
|
-
# Create index file
|
72
|
-
#
|
73
|
-
# def create_index
|
74
|
-
# index_file = "index-w3c.yaml"
|
75
|
-
# index_yaml = @index.sort do |a, b|
|
76
|
-
# compare_index_items a, b
|
77
|
-
# end.to_yaml
|
78
|
-
# File.write index_file, index_yaml, encoding: "UTF-8"
|
79
|
-
# end
|
80
|
-
|
81
|
-
#
|
82
|
-
# Compare index items
|
83
|
-
#
|
84
|
-
# @param [Hash] aid first item
|
85
|
-
# @param [Hash] bid second item
|
86
|
-
#
|
87
|
-
# @return [Integer] comparison result
|
88
|
-
#
|
89
|
-
# def compare_index_items(aid, bid) # rubocop:disable Metrics/AbcSize
|
90
|
-
# ret = aid[:code] <=> bid[:code]
|
91
|
-
# ret = stage_weight(bid[:stage]) <=> stage_weight(aid[:stage]) if ret.zero?
|
92
|
-
# ret = date_weight(bid[:date]) <=> date_weight(aid[:date]) if ret.zero?
|
93
|
-
# # ret = aid[:type] <=> bid[:type] if ret.zero?
|
94
|
-
# ret
|
95
|
-
# end
|
96
|
-
|
97
|
-
#
|
98
|
-
# Weight of stage
|
99
|
-
#
|
100
|
-
# @param [String, nil] stage stage
|
101
|
-
#
|
102
|
-
# @return [Integer] weight
|
103
|
-
#
|
104
|
-
# def stage_weight(stage)
|
105
|
-
# return DataParser::STAGES.size if stage.nil?
|
106
|
-
|
107
|
-
# DataParser::STAGES.keys.index(stage)
|
108
|
-
# end
|
109
|
-
|
110
|
-
#
|
111
|
-
# Weight of date
|
112
|
-
#
|
113
|
-
# @param [String] date date
|
114
|
-
#
|
115
|
-
# @return [String] weight
|
116
|
-
#
|
117
|
-
# def date_weight(date)
|
118
|
-
# return "99999999" if date.nil?
|
119
|
-
|
120
|
-
# date
|
121
|
-
# end
|
122
|
-
|
123
70
|
#
|
124
71
|
# Query RDF source for documents
|
125
72
|
#
|
@@ -13,25 +13,6 @@ module RelatonW3c
|
|
13
13
|
@index = index
|
14
14
|
end
|
15
15
|
|
16
|
-
#
|
17
|
-
# Create index from a GitHub repository
|
18
|
-
#
|
19
|
-
# @return [RelatonW3c::DataIndex] data index
|
20
|
-
#
|
21
|
-
def self.create_from_repo # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
|
22
|
-
resp = Zip::InputStream.new URI("#{W3cBibliography::SOURCE}index-w3c.zip").open
|
23
|
-
zip = resp.get_next_entry
|
24
|
-
|
25
|
-
# Newer versions of Psych uses the `permitted_classes:` parameter
|
26
|
-
index = if YAML.method(:safe_load).parameters.collect(&:last).index(:permitted_classes)
|
27
|
-
YAML.safe_load(zip.get_input_stream.read, permitted_classes: [Symbol])
|
28
|
-
else
|
29
|
-
YAML.safe_load(zip.get_input_stream.read, [Symbol])
|
30
|
-
end
|
31
|
-
|
32
|
-
DataIndex.new index: index
|
33
|
-
end
|
34
|
-
|
35
16
|
#
|
36
17
|
# Add document to index
|
37
18
|
#
|
@@ -39,7 +20,9 @@ module RelatonW3c
|
|
39
20
|
# @param [String] file path to document file
|
40
21
|
#
|
41
22
|
def add(docnumber, file)
|
42
|
-
|
23
|
+
dnparts = self.class.docnumber_to_parts docnumber
|
24
|
+
dnparts[:file] = file
|
25
|
+
@index << dnparts
|
43
26
|
end
|
44
27
|
|
45
28
|
#
|
@@ -67,11 +50,14 @@ module RelatonW3c
|
|
67
50
|
# @return [String] document's filename
|
68
51
|
#
|
69
52
|
def search(ref) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
70
|
-
dparts = docnumber_to_parts(ref)
|
53
|
+
dparts = self.class.docnumber_to_parts(ref)
|
54
|
+
return if dparts[:code].nil?
|
55
|
+
|
71
56
|
@index.detect do |parts|
|
72
57
|
parts[:code].match?(/^#{Regexp.escape dparts[:code]}/i) &&
|
73
58
|
(dparts[:stage].nil? || dparts[:stage].casecmp?(parts[:stage])) &&
|
74
|
-
(dparts[:type].nil? || dparts[:type].casecmp?(parts[:type])
|
59
|
+
(dparts[:type].nil? || dparts[:type].casecmp?(parts[:type]) ||
|
60
|
+
(parts[:type].nil? && dparts[:type] == "TR")) &&
|
75
61
|
(dparts[:date].nil? || dparts[:date] == parts[:date]) &&
|
76
62
|
(dparts[:suff].nil? || dparts[:suff].casecmp?(parts[:suff]))
|
77
63
|
end&.fetch(:file)
|
@@ -119,28 +105,47 @@ module RelatonW3c
|
|
119
105
|
date
|
120
106
|
end
|
121
107
|
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
(
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
108
|
+
class << self
|
109
|
+
#
|
110
|
+
# Create index from a GitHub repository
|
111
|
+
#
|
112
|
+
# @return [RelatonW3c::DataIndex] data index
|
113
|
+
#
|
114
|
+
def create_from_repo # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
|
115
|
+
resp = Zip::InputStream.new URI("#{W3cBibliography::SOURCE}index-w3c.zip").open
|
116
|
+
zip = resp.get_next_entry
|
117
|
+
|
118
|
+
# Newer versions of Psych uses the `permitted_classes:` parameter
|
119
|
+
index = if YAML.method(:safe_load).parameters.collect(&:last).index(:permitted_classes)
|
120
|
+
YAML.safe_load(zip.get_input_stream.read, permitted_classes: [Symbol])
|
121
|
+
else
|
122
|
+
YAML.safe_load(zip.get_input_stream.read, [Symbol])
|
123
|
+
end
|
124
|
+
|
125
|
+
DataIndex.new index: index
|
126
|
+
end
|
127
|
+
|
128
|
+
#
|
129
|
+
# Parse document number to parts
|
130
|
+
#
|
131
|
+
# @param [String] docnumber document number
|
132
|
+
#
|
133
|
+
# @return [Hash{Symbol=>String}] document parts
|
134
|
+
#
|
135
|
+
def docnumber_to_parts(docnumber) # rubocop:disable Metrics/MethodLength
|
136
|
+
%r{
|
137
|
+
^(?:(?:(?<stage>WD|CRD|CR|PR|PER|REC|SPSD|OBSL|RET)|(?<type>D?NOTE|TR))-)?
|
138
|
+
(?<code>\w+(?:[+-][\w.]+)*?)
|
139
|
+
(?:-(?<date>\d{8}|\d{6}))?
|
140
|
+
(?:/(?<suff>\w+))?$
|
141
|
+
}xi =~ docnumber
|
142
|
+
entry = { code: code }
|
143
|
+
entry[:stage] = stage if stage
|
144
|
+
entry[:type] = type if type
|
145
|
+
entry[:date] = date if date
|
146
|
+
entry[:suff] = suff if suff
|
147
|
+
entry
|
148
|
+
end
|
144
149
|
end
|
145
150
|
end
|
146
151
|
end
|
@@ -169,7 +169,7 @@ module RelatonW3c
|
|
169
169
|
#
|
170
170
|
def type
|
171
171
|
# thre are many types, we need to find the right one
|
172
|
-
@type ||= types_stages&.detect { |t| USED_TYPES.include?(t) }
|
172
|
+
@type ||= types_stages&.detect { |t| USED_TYPES.include?(t) } || "technicalReport"
|
173
173
|
end
|
174
174
|
|
175
175
|
#
|
data/lib/relaton_w3c/version.rb
CHANGED
@@ -9,10 +9,10 @@ module RelatonW3c
|
|
9
9
|
|
10
10
|
class << self
|
11
11
|
# @param text [String]
|
12
|
-
# @return [RelatonW3c::
|
13
|
-
def search(text) # rubocop:disable Metrics/MethodLength
|
12
|
+
# @return [RelatonW3c::W3cBibliographicItem]
|
13
|
+
def search(text) # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
|
14
14
|
ref = DataParser.parse_identifier text.sub(/^W3C\s/, "")
|
15
|
-
file = DataIndex.create_from_repo.search(
|
15
|
+
file = DataIndex.create_from_repo.search ref.gsub(" ", "-").squeeze("-")
|
16
16
|
return unless file
|
17
17
|
|
18
18
|
url = "#{SOURCE}#{file}"
|
data/lib/relaton_w3c.rb
CHANGED
@@ -2,9 +2,6 @@ require "relaton_bib"
|
|
2
2
|
require "relaton_w3c/version"
|
3
3
|
require "relaton_w3c/w3c_bibliography"
|
4
4
|
require "relaton_w3c/w3c_bibliographic_item"
|
5
|
-
# require "relaton_w3c/hit_collection"
|
6
|
-
# require "relaton_w3c/hit"
|
7
|
-
# require "relaton_w3c/scrapper"
|
8
5
|
require "relaton_w3c/xml_parser"
|
9
6
|
require "relaton_w3c/bibxml_parser"
|
10
7
|
require "relaton_w3c/hash_converter"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: relaton-w3c
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.11.
|
4
|
+
version: 1.11.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-04-
|
11
|
+
date: 2022-04-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: equivalent-xml
|
@@ -247,10 +247,7 @@ files:
|
|
247
247
|
- lib/relaton_w3c/data_index.rb
|
248
248
|
- lib/relaton_w3c/data_parser.rb
|
249
249
|
- lib/relaton_w3c/hash_converter.rb
|
250
|
-
- lib/relaton_w3c/hit.rb
|
251
|
-
- lib/relaton_w3c/hit_collection.rb
|
252
250
|
- lib/relaton_w3c/processor.rb
|
253
|
-
- lib/relaton_w3c/scrapper.rb
|
254
251
|
- lib/relaton_w3c/version.rb
|
255
252
|
- lib/relaton_w3c/w3c_bibliographic_item.rb
|
256
253
|
- lib/relaton_w3c/w3c_bibliography.rb
|
data/lib/relaton_w3c/hit.rb
DELETED
@@ -1,15 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module RelatonW3c
|
4
|
-
# Hit.
|
5
|
-
class Hit < RelatonBib::Hit
|
6
|
-
#
|
7
|
-
# Parse page.
|
8
|
-
#
|
9
|
-
# @param lang [String, NilClass]
|
10
|
-
# @return [RelatonW3c::W3cBibliographicItem]
|
11
|
-
def fetch(_lang = nil)
|
12
|
-
@fetch ||= Scrapper.parse_page hit
|
13
|
-
end
|
14
|
-
end
|
15
|
-
end
|
@@ -1,172 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require "fileutils"
|
4
|
-
require "yaml"
|
5
|
-
|
6
|
-
module RelatonW3c
|
7
|
-
# Page of hit collection.
|
8
|
-
class HitCollection < RelatonBib::HitCollection
|
9
|
-
TYPES = {
|
10
|
-
"CR" => "Candidate Recommendation",
|
11
|
-
"NOTE" => "Group Note",
|
12
|
-
"PER" => "Proposed Edited Recommendation",
|
13
|
-
"PR" => "Proposed Recommendation",
|
14
|
-
"REC" => "Recommendation",
|
15
|
-
"RET" => "Retired",
|
16
|
-
"WD" => "Working Draft",
|
17
|
-
}.freeze
|
18
|
-
DOMAIN = "https://www.w3.org"
|
19
|
-
DATADIR = File.expand_path(".relaton/w3c", Dir.home).freeze
|
20
|
-
DATAFILE = File.expand_path("bibliography.yml", DATADIR).freeze
|
21
|
-
|
22
|
-
# @param ref [String] reference to search
|
23
|
-
def initialize(ref)
|
24
|
-
%r{
|
25
|
-
^(?:W3C\s)?
|
26
|
-
(?<type>(?:CR|NOTE|PER|PR|REC|RET|WD|Candidate\sRecommendation|
|
27
|
-
Group\sNote|Proposed\sEdited\sRecommendation|Proposed\sRecommendation|
|
28
|
-
Recommendation|Retired|Working\sDraft))? # type
|
29
|
-
\s?
|
30
|
-
(?<title_date>.+) # title_date
|
31
|
-
}x =~ ref
|
32
|
-
super
|
33
|
-
@array = from_yaml title_date, type
|
34
|
-
end
|
35
|
-
|
36
|
-
private
|
37
|
-
|
38
|
-
#
|
39
|
-
# Fetch data form yaml
|
40
|
-
#
|
41
|
-
# @param title_date [String]
|
42
|
-
# @param type [String]
|
43
|
-
# @return [Array<Hash>]
|
44
|
-
def from_yaml(title_date, type) # rubocop:disable Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/MethodLength,Metrics/PerceivedComplexity
|
45
|
-
/(?<title>.+)\s(?<date>\d{4}-\d{2}-\d{2})$/ =~ title_date
|
46
|
-
title ||= title_date
|
47
|
-
result = data.select do |hit|
|
48
|
-
(hit["title"].casecmp?(title) ||
|
49
|
-
hit["link"].split("/").last.match?(/-#{title}-/)) &&
|
50
|
-
type_date_filter(hit, type, date)
|
51
|
-
end
|
52
|
-
if result.empty?
|
53
|
-
result = data.select { |h| h["link"].split("/").last.match?(/#{title}/) }
|
54
|
-
end
|
55
|
-
result.map { |h| Hit.new(h, self) }
|
56
|
-
end
|
57
|
-
|
58
|
-
# @param hit [Hash]
|
59
|
-
# @param type [String]
|
60
|
-
# @param date [String]
|
61
|
-
# @return [TrueClass, FalseClass]
|
62
|
-
def type_date_filter(hit, type, date) # rubocop:disable Metrics/AbcSize
|
63
|
-
if (type && hit["type"] != short_type(type)) || (date && hit["date"] != date)
|
64
|
-
history = get_history hit, type, date
|
65
|
-
return false unless history.any?
|
66
|
-
|
67
|
-
hit["type"] = short_type type
|
68
|
-
hit["datepub"] = history.first.at("td").text
|
69
|
-
hit["link"] = history.first.at("a")[:href]
|
70
|
-
end
|
71
|
-
true
|
72
|
-
end
|
73
|
-
|
74
|
-
# @param hit [Hash]
|
75
|
-
# @param type [String]
|
76
|
-
# @param date [String]
|
77
|
-
# @return [Array<Nokogiri::XML::Element>, Nokogiri::HTML::NodeSet]
|
78
|
-
def get_history(hit, type, date)
|
79
|
-
resp = Net::HTTP.get URI.parse(HitCollection::DOMAIN + hit["history"])
|
80
|
-
history_doc = Nokogiri::HTML resp
|
81
|
-
history = history_doc.xpath(
|
82
|
-
"//table//a[contains(.,'#{long_type(type)}')]/../..",
|
83
|
-
)
|
84
|
-
return filter_history_by_date(history, history_doc, type, date) if date
|
85
|
-
|
86
|
-
history
|
87
|
-
end
|
88
|
-
|
89
|
-
# @param history [Nokogiri::XML::NodeSet]
|
90
|
-
# @param history_doc [Nokogiri::HTML::NodeSet]
|
91
|
-
# @param type [String]
|
92
|
-
# @param date [String]
|
93
|
-
# @return [Array<Nokogiri::XML::Element>, Nokogiri::HTML::NodeSet]
|
94
|
-
def filter_history_by_date(history, history_doc, type, date)
|
95
|
-
if type
|
96
|
-
history.select do |h|
|
97
|
-
h.at("td[@class='table_datecol']").text == date
|
98
|
-
end
|
99
|
-
else
|
100
|
-
history_doc.xpath(
|
101
|
-
"//table//td[@class='table_datecol'][.='#{date}']/..",
|
102
|
-
)
|
103
|
-
end
|
104
|
-
end
|
105
|
-
|
106
|
-
#
|
107
|
-
# Convetr long type name to short
|
108
|
-
#
|
109
|
-
# @param type [String]
|
110
|
-
# @return [String]
|
111
|
-
def short_type(type)
|
112
|
-
tp = TYPES.select { |_, v| v == type }.keys
|
113
|
-
tp.first || type
|
114
|
-
end
|
115
|
-
|
116
|
-
#
|
117
|
-
# Convert shot type name to long
|
118
|
-
#
|
119
|
-
# @param [String]
|
120
|
-
# @return [String]
|
121
|
-
def long_type(type)
|
122
|
-
TYPES[type] || type
|
123
|
-
end
|
124
|
-
|
125
|
-
#
|
126
|
-
# Fetches YAML data
|
127
|
-
#
|
128
|
-
# @return [Hash]
|
129
|
-
def data
|
130
|
-
FileUtils.mkdir_p DATADIR
|
131
|
-
ctime = File.ctime DATAFILE if File.exist? DATAFILE
|
132
|
-
fetch_data if !ctime || ctime.to_date < Date.today
|
133
|
-
@data ||= YAML.safe_load File.read(DATAFILE, encoding: "UTF-8")
|
134
|
-
end
|
135
|
-
|
136
|
-
#
|
137
|
-
# fetch data form server and save it to file.
|
138
|
-
#
|
139
|
-
def fetch_data
|
140
|
-
resp = Net::HTTP.get_response URI.parse("#{DOMAIN}/TR/")
|
141
|
-
# return if there aren't any changes since last fetching
|
142
|
-
return unless resp.code == "200"
|
143
|
-
|
144
|
-
doc = Nokogiri::HTML resp.body
|
145
|
-
@data = doc.xpath("//ul[@id='container']/li").map do |h_el|
|
146
|
-
link = h_el.at("h2/a")
|
147
|
-
pubdetails = h_el.at("p[@class='pubdetails']")
|
148
|
-
fetch_hit h_el, link, pubdetails
|
149
|
-
end
|
150
|
-
File.write DATAFILE, @data.to_yaml, encoding: "UTF-8"
|
151
|
-
end
|
152
|
-
|
153
|
-
# @param h_el [Nokogiri::XML::Element]
|
154
|
-
# @param link [Nokogiri::XML::Element]
|
155
|
-
# @param pubdetails [Nokogiri::XML::Element]
|
156
|
-
def fetch_hit(h_el, link, pubdetails) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
|
157
|
-
datepub = pubdetails.at("text()").text.match(/\d{4}-\d{2}-\d{2}/).to_s
|
158
|
-
editor = h_el.xpath("ul[@class='editorlist']/li").map { |e| e.text.strip }
|
159
|
-
keyword = h_el.xpath("ul[@class='taglist']/li").map { |e| e.text.strip }
|
160
|
-
{
|
161
|
-
"title" => link.text.gsub("\u00a0", " "),
|
162
|
-
"link" => link[:href],
|
163
|
-
"type" => h_el.at("div").text.upcase,
|
164
|
-
"workgroup" => h_el.xpath("p[@class='deliverer']").map(&:text),
|
165
|
-
"datepub" => datepub,
|
166
|
-
"history" => pubdetails.at("a[text()='History']")[:href],
|
167
|
-
"editor" => editor,
|
168
|
-
"keyword" => keyword,
|
169
|
-
}
|
170
|
-
end
|
171
|
-
end
|
172
|
-
end
|
data/lib/relaton_w3c/scrapper.rb
DELETED
@@ -1,218 +0,0 @@
|
|
1
|
-
module RelatonW3c
|
2
|
-
class Scrapper
|
3
|
-
DOCTYPES = {
|
4
|
-
"CR" => "candidateRecommendation",
|
5
|
-
"NOTE" => "groupNote",
|
6
|
-
"PER" => "proposedEditedRecommendation",
|
7
|
-
"PR" => "proposedRecommendation",
|
8
|
-
"REC" => "recommendation",
|
9
|
-
"RET" => "retired",
|
10
|
-
"WD" => "workingDraft",
|
11
|
-
}.freeze
|
12
|
-
|
13
|
-
class << self
|
14
|
-
# @param hit [Hash]
|
15
|
-
# @return [RelatonW3c::W3cBibliographicItem]
|
16
|
-
def parse_page(hit) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
17
|
-
resp = Net::HTTP.get_response URI.parse(hit["link"])
|
18
|
-
doc = resp.code == "200" ? Nokogiri::HTML(resp.body) : nil
|
19
|
-
W3cBibliographicItem.new(
|
20
|
-
type: "standard",
|
21
|
-
docid: fetch_docid(hit),
|
22
|
-
fetched: Date.today.to_s,
|
23
|
-
language: ["en"],
|
24
|
-
script: ["Latn"],
|
25
|
-
title: fetch_title(hit, doc),
|
26
|
-
abstract: fetch_abstract(doc),
|
27
|
-
link: fetch_link(hit),
|
28
|
-
date: fetch_date(hit, doc),
|
29
|
-
doctype: fetch_doctype(hit, doc),
|
30
|
-
contributor: fetch_contributor(hit, doc),
|
31
|
-
relation: fetch_relation(doc),
|
32
|
-
keyword: hit["keyword"],
|
33
|
-
)
|
34
|
-
end
|
35
|
-
|
36
|
-
private
|
37
|
-
|
38
|
-
# @param hit [Hash]
|
39
|
-
# @return [Array<RelatonBib::DocumentIdentifier>]
|
40
|
-
def fetch_docid(hit)
|
41
|
-
id = hit["link"].split("/").last
|
42
|
-
[RelatonBib::DocumentIdentifier.new(id: id, type: "W3C", primary: true)]
|
43
|
-
end
|
44
|
-
|
45
|
-
# @param hit [Hash]
|
46
|
-
# @param doc [Nokogiri::HTML::Document]
|
47
|
-
# @return [Array<RelatonBib::TypedTitleString>]
|
48
|
-
def fetch_title(hit, doc) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
|
49
|
-
titles = []
|
50
|
-
if doc
|
51
|
-
title = doc.at("//*[contains(@id, 'title')]")&.text
|
52
|
-
if title && !title.empty?
|
53
|
-
titles << { content: title.gsub(/\n/, " "), type: "main" }
|
54
|
-
end
|
55
|
-
subtitle = doc.at(
|
56
|
-
"//h2[@id='subtitle']|//p[contains(@class, 'subline')]",
|
57
|
-
)&.text
|
58
|
-
titles << { content: subtitle, tipe: "subtitle" } if subtitle
|
59
|
-
end
|
60
|
-
if titles.empty? && hit["title"]
|
61
|
-
titles << { content: hit["title"], type: "main" }
|
62
|
-
end
|
63
|
-
titles.map do |t|
|
64
|
-
title = RelatonBib::FormattedString.new(
|
65
|
-
content: t[:content], language: "en", script: "Latn",
|
66
|
-
)
|
67
|
-
RelatonBib::TypedTitleString.new(type: t[:type], title: title)
|
68
|
-
end
|
69
|
-
end
|
70
|
-
|
71
|
-
# @param doc [Nokogiri::HTML::Document, NilClass]
|
72
|
-
# @return [Array<RelatonBib::FormattedString>]
|
73
|
-
def fetch_abstract(doc)
|
74
|
-
return [] unless doc
|
75
|
-
|
76
|
-
content = doc.at("//h2[.='Abstract']/following-sibling::p",
|
77
|
-
"//div[@class='abstract']/p").text
|
78
|
-
[RelatonBib::FormattedString.new(content: content, language: "en",
|
79
|
-
script: "Latn")]
|
80
|
-
end
|
81
|
-
|
82
|
-
# @param hit [Hash]
|
83
|
-
# @return [Array<RelatonBib::TypedUri>]
|
84
|
-
def fetch_link(hit)
|
85
|
-
[RelatonBib::TypedUri.new(type: "src", content: hit["link"])]
|
86
|
-
end
|
87
|
-
|
88
|
-
# @param hit [Hash]
|
89
|
-
# @param doc [Nokogiri::HTML::Document, NilClass]
|
90
|
-
# @return [Array<RelatonBib::BibliographicDate>]
|
91
|
-
def fetch_date(hit, doc) # rubocop:disable Metrics/CyclomaticComplexity
|
92
|
-
on = hit["datepub"] || doc&.at("//h2/time[@datetime]")&.attr(:datetime)
|
93
|
-
on ||= fetch_date1(doc) || fetch_date2(doc)
|
94
|
-
[RelatonBib::BibliographicDate.new(type: "published", on: on)] if on
|
95
|
-
end
|
96
|
-
|
97
|
-
# @param doc [Nokogiri::HTML::Document, NilClass]
|
98
|
-
# @return [String]
|
99
|
-
def fetch_date1(doc)
|
100
|
-
d = doc&.at("//h2[@property='dc:issued']")&.attr(:content)
|
101
|
-
d&.match(/\d{4}-\d{2}-\d{2}/)&.to_s
|
102
|
-
end
|
103
|
-
|
104
|
-
# @param doc [Nokogiri::HTML::Document, NilClass]
|
105
|
-
# @return [String]
|
106
|
-
def fetch_date2(doc)
|
107
|
-
d = doc&.at("//h2[contains(@id, 'w3c-recommendation')]")
|
108
|
-
return unless d
|
109
|
-
|
110
|
-
Date.parse(d.attr(:id.match(/\d{2}-\w+-\d{4}/).to_s)).to_s
|
111
|
-
end
|
112
|
-
|
113
|
-
# @param hit [Hash]
|
114
|
-
# @param doc [Nokogiri::HTML::Document, NilClass]
|
115
|
-
# @return [String]
|
116
|
-
def fetch_doctype(hit, doc)
|
117
|
-
if hit["type"]
|
118
|
-
DOCTYPES[hit["type"]]
|
119
|
-
elsif doc
|
120
|
-
type = HitCollection::TYPES.detect do |_k, v|
|
121
|
-
doc.at("//h2[contains(., '#{v}')]/time[@datetime]")
|
122
|
-
end
|
123
|
-
DOCTYPES[type&.first]
|
124
|
-
end
|
125
|
-
end
|
126
|
-
|
127
|
-
# @param hit [Hash]
|
128
|
-
# @param doc [Nokogiri::HTML::Document, NilClass]
|
129
|
-
# @return [Array<RelatonBib::ContributionInfo>]
|
130
|
-
def fetch_contributor(hit, doc) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
|
131
|
-
if doc
|
132
|
-
editors = find_contribs(doc, "Editors").reduce([]) do |mem, ed|
|
133
|
-
c = parse_contrib ed, "editor"
|
134
|
-
mem << c if c
|
135
|
-
mem
|
136
|
-
end
|
137
|
-
contribs = find_contribs(doc, "Authors").reduce(editors) do |mem, ath|
|
138
|
-
ed = mem.detect { |e| e[:id] && e[:id] == ath["data-editor-id"] }
|
139
|
-
if ed
|
140
|
-
ed[:role] << { type: "author" }
|
141
|
-
else
|
142
|
-
mem << parse_contrib(ath, "author")
|
143
|
-
end
|
144
|
-
mem
|
145
|
-
end
|
146
|
-
contribs.map { |c| contrib_info(**c) }
|
147
|
-
else
|
148
|
-
hit["editor"].map do |ed|
|
149
|
-
contrib_info name: ed, role: [{ type: "editor" }]
|
150
|
-
end
|
151
|
-
end
|
152
|
-
end
|
153
|
-
|
154
|
-
# @param doc [Nokogiri::NTML::Document]
|
155
|
-
# @param type [String]
|
156
|
-
# @return [Array<Nokogiri::XML::Element]
|
157
|
-
def find_contribs(doc, type)
|
158
|
-
doc.xpath("//dt[contains(.,'#{type}')]/following-sibling::dd"\
|
159
|
-
"[preceding-sibling::dt[1][contains(.,'#{type}')]]")
|
160
|
-
end
|
161
|
-
|
162
|
-
# @param element [Nokogiri::XML::Element]
|
163
|
-
# @param type [String]
|
164
|
-
# @return [Hash]
|
165
|
-
def parse_contrib(element, type) # rubocop:disable Metrics/MethodLength
|
166
|
-
p = element.at("a")
|
167
|
-
return unless p
|
168
|
-
|
169
|
-
contrib = {
|
170
|
-
name: p.text,
|
171
|
-
url: p[:href],
|
172
|
-
role: [{ type: type }],
|
173
|
-
id: element["data-editor-id"],
|
174
|
-
}
|
175
|
-
org = element.at("a[2]")
|
176
|
-
contrib[:org] = { name: org.text, url: org[:href] } if org
|
177
|
-
contrib
|
178
|
-
end
|
179
|
-
|
180
|
-
# @param name [String]
|
181
|
-
# @param url [String, NilClass]
|
182
|
-
# @param role [Array<Hash>]
|
183
|
-
# @parma org [Hash]
|
184
|
-
# @return [RelatonBib::ContributionInfo]
|
185
|
-
def contrib_info(**args)
|
186
|
-
completename = RelatonBib::LocalizedString.new(args[:name])
|
187
|
-
name = RelatonBib::FullName.new completename: completename
|
188
|
-
af = []
|
189
|
-
if args[:org]
|
190
|
-
org = RelatonBib::Organization.new(**args[:org])
|
191
|
-
af << RelatonBib::Affiliation.new(organization: org)
|
192
|
-
end
|
193
|
-
en = RelatonBib::Person.new name: name, url: args[:url], affiliation: af
|
194
|
-
RelatonBib::ContributionInfo.new entity: en, role: args[:role]
|
195
|
-
end
|
196
|
-
|
197
|
-
# @param doc [Nokogiri::HTML::Document]
|
198
|
-
# @return [Array<RelatonBib::DocumentRelation>]
|
199
|
-
def fetch_relation(doc)
|
200
|
-
return [] unless doc && (link = recommendation_link(doc))
|
201
|
-
|
202
|
-
hit = { "link" => link }
|
203
|
-
item = parse_page hit
|
204
|
-
[RelatonBib::DocumentRelation.new(type: "obsoletedBy", bibitem: item)]
|
205
|
-
end
|
206
|
-
|
207
|
-
# @param doc [Nokogiri::HTML::Document]
|
208
|
-
# @return [String, NilClass]
|
209
|
-
def recommendation_link(doc)
|
210
|
-
recom = doc.at("//dt[.='Latest Recommendation:']",
|
211
|
-
"//dt[.='Previous Recommendation:']")
|
212
|
-
return unless recom
|
213
|
-
|
214
|
-
recom.at("./following-sibling::dd/a")[:href]
|
215
|
-
end
|
216
|
-
end
|
217
|
-
end
|
218
|
-
end
|