relaton-w3c 1.3.0 → 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/relaton_w3c/scrapper.rb +38 -15
- data/lib/relaton_w3c/version.rb +1 -1
- metadata +5 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8591c4405cfb771a9cf41e2c74a7d36f21f66e02776389efdf670e639818524a
|
4
|
+
data.tar.gz: be0e0ea3effefd0ab0a48d72cb42eb5da6c62537deaa6dca876d3ee000cc12c6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: de33be8c4b6a7d9b3ee165454f6988044f5e614239d35e5c9bfa4b8dfc724430f6366dfb90d9bd1c648c78ef1ef7a3701e4ac4461eff9d82be2e2502e588f8bf
|
7
|
+
data.tar.gz: fa7c426893f99844fa702dcbd349bcc141800994e167da25657ea1d5143de442aa5be4be03242221635edf9836d4c1c678e4a522f52c964ddd50f851fdda8d4a
|
data/lib/relaton_w3c/scrapper.rb
CHANGED
@@ -13,7 +13,7 @@ module RelatonW3c
|
|
13
13
|
|
14
14
|
# @param hit [Hash]
|
15
15
|
# @return [RelatonW3c::W3cBibliographicItem]
|
16
|
-
def parse_page(hit)
|
16
|
+
def parse_page(hit) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
17
17
|
resp = Net::HTTP.get_response URI.parse(hit["link"])
|
18
18
|
doc = resp.code == "200" ? Nokogiri::HTML(resp.body) : nil
|
19
19
|
W3cBibliographicItem.new(
|
@@ -28,7 +28,7 @@ module RelatonW3c
|
|
28
28
|
doctype: fetch_doctype(hit, doc),
|
29
29
|
contributor: fetch_contributor(hit, doc),
|
30
30
|
relation: fetch_relation(doc),
|
31
|
-
keyword: hit["keyword"]
|
31
|
+
keyword: hit["keyword"]
|
32
32
|
)
|
33
33
|
end
|
34
34
|
|
@@ -37,19 +37,21 @@ module RelatonW3c
|
|
37
37
|
# @param hit [Hash]
|
38
38
|
# @param doc [Nokogiri::HTML::Document]
|
39
39
|
# @return [Array<RelatonBib::TypedTitleString>]
|
40
|
-
def fetch_title(hit, doc)
|
40
|
+
def fetch_title(hit, doc) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
|
41
41
|
titles = []
|
42
42
|
if doc
|
43
|
-
title = doc.at("//h1[@id
|
44
|
-
titles << { content: title, type: "main" }
|
45
|
-
subtitle = doc.at(
|
46
|
-
|
43
|
+
title = doc.at("//h1[contains(@id, 'title')]")&.text
|
44
|
+
titles << { content: title, type: "main" } if title
|
45
|
+
subtitle = doc.at(
|
46
|
+
"//h2[@id='subtitle']|//p[contains(@class, 'subline')]"
|
47
|
+
)&.text
|
48
|
+
titles << { content: subtitle, tipe: "subtitle" } if subtitle
|
47
49
|
elsif hit["title"]
|
48
50
|
titles << { content: hit["title"], type: "main" }
|
49
51
|
end
|
50
52
|
titles.map do |t|
|
51
53
|
title = RelatonBib::FormattedString.new(
|
52
|
-
content: t[:content], language: "en", script: "Latn"
|
54
|
+
content: t[:content], language: "en", script: "Latn"
|
53
55
|
)
|
54
56
|
RelatonBib::TypedTitleString.new(type: t[:type], title: title)
|
55
57
|
end
|
@@ -75,10 +77,27 @@ module RelatonW3c
|
|
75
77
|
# @param doc [Nokogiri::HTML::Document, NilClass]
|
76
78
|
# @return [Array<RelatonBib::BibliographicDate>]
|
77
79
|
def fetch_date(hit, doc)
|
78
|
-
on = hit["datepub"] || doc
|
80
|
+
on = hit["datepub"] || doc&.at("//h2/time[@datetime]")&.attr(:datetime)
|
81
|
+
on ||= fetch_date1(doc) || fetch_date2(doc)
|
79
82
|
[RelatonBib::BibliographicDate.new(type: "published", on: on)] if on
|
80
83
|
end
|
81
84
|
|
85
|
+
# @param doc [Nokogiri::HTML::Document, NilClass]
|
86
|
+
# @return [String]
|
87
|
+
def fetch_date1(doc)
  # Reads the `content` attribute of the first <h2 property="dc:issued">
  # element and returns the first YYYY-MM-DD token found in it.
  #
  # doc may be nil (the page could not be fetched); the element, the
  # attribute or the date token may all be absent. In every such case the
  # result is nil, so callers can chain further fallbacks with `||`.
  issued = doc&.at("//h2[@property='dc:issued']")&.attr(:content)
  issued&.slice(/\d{4}-\d{2}-\d{2}/)
end
|
91
|
+
|
92
|
+
# @param doc [Nokogiri::HTML::Document, NilClass]
|
93
|
+
# @return [String]
|
94
|
+
def fetch_date2(doc)
  # Looks for the first <h2> whose id contains 'w3c-recommendation' and
  # parses a "DD-word-YYYY" token out of that id (for example an id ending
  # in "15-december-2009"). Returns the date as a YYYY-MM-DD string, or nil
  # when doc is nil, no such heading exists, or the id carries no token.
  heading = doc&.at("//h2[contains(@id, 'w3c-recommendation')]")
  return unless heading

  # The attribute value must be fetched first and THEN searched. Applying
  # the regexp to the Symbol :id never matches, which made the attribute
  # lookup use an empty name and Date.parse receive nil.
  stamp = heading.attr(:id).to_s[/\d{2}-\w+-\d{4}/]
  Date.parse(stamp).to_s if stamp
end
|
100
|
+
|
82
101
|
# @param hit [Hash]
|
83
102
|
# @param doc [Nokogiri::HTML::Document, NilClass]
|
84
103
|
# @return [String]
|
@@ -96,17 +115,19 @@ module RelatonW3c
|
|
96
115
|
# @param hit [Hash]
|
97
116
|
# @param doc [Nokogiri::HTML::Document, NilClass]
|
98
117
|
# @return [Array<RelatonBib::ContributionInfo>]
|
99
|
-
def fetch_contributor(hit, doc)
|
118
|
+
def fetch_contributor(hit, doc) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
|
100
119
|
if doc
|
101
|
-
editors = find_contribs(doc, "Editors").
|
102
|
-
parse_contrib ed, "editor"
|
120
|
+
editors = find_contribs(doc, "Editors").reduce([]) do |mem, ed|
|
121
|
+
c = parse_contrib ed, "editor"
|
122
|
+
mem << c if c
|
123
|
+
mem
|
103
124
|
end
|
104
|
-
contribs = find_contribs(doc, "Authors").reduce(editors) do |mem,
|
105
|
-
ed = mem.detect { |e| e[:id] && e[:id] ==
|
125
|
+
contribs = find_contribs(doc, "Authors").reduce(editors) do |mem, ath|
|
126
|
+
ed = mem.detect { |e| e[:id] && e[:id] == ath["data-editor-id"] }
|
106
127
|
if ed
|
107
128
|
ed[:role] << { type: "author" }
|
108
129
|
else
|
109
|
-
mem << parse_contrib(
|
130
|
+
mem << parse_contrib(ath, "author")
|
110
131
|
end
|
111
132
|
mem
|
112
133
|
end
|
@@ -131,6 +152,8 @@ module RelatonW3c
|
|
131
152
|
# @return [Hash]
|
132
153
|
def parse_contrib(element, type)
|
133
154
|
p = element.at("a")
|
155
|
+
return unless p
|
156
|
+
|
134
157
|
contrib = {
|
135
158
|
name: p.text,
|
136
159
|
url: p[:href],
|
data/lib/relaton_w3c/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: relaton-w3c
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.
|
4
|
+
version: 1.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-08-
|
11
|
+
date: 2020-08-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: debase
|
@@ -162,7 +162,7 @@ licenses:
|
|
162
162
|
- BSD-2-Clause
|
163
163
|
metadata:
|
164
164
|
homepage_uri: https://github.com/relaton/relaton-wc3
|
165
|
-
post_install_message:
|
165
|
+
post_install_message:
|
166
166
|
rdoc_options: []
|
167
167
|
require_paths:
|
168
168
|
- lib
|
@@ -178,7 +178,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
178
178
|
version: '0'
|
179
179
|
requirements: []
|
180
180
|
rubygems_version: 3.0.6
|
181
|
-
signing_key:
|
181
|
+
signing_key:
|
182
182
|
specification_version: 4
|
183
183
|
summary: 'RelatonIso: retrieve W3C Standards for bibliographic using the IsoBibliographicItem
|
184
184
|
model'
|