relaton-w3c 1.3.0 → 1.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/relaton_w3c/scrapper.rb +38 -15
- data/lib/relaton_w3c/version.rb +1 -1
- metadata +5 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8591c4405cfb771a9cf41e2c74a7d36f21f66e02776389efdf670e639818524a
|
4
|
+
data.tar.gz: be0e0ea3effefd0ab0a48d72cb42eb5da6c62537deaa6dca876d3ee000cc12c6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: de33be8c4b6a7d9b3ee165454f6988044f5e614239d35e5c9bfa4b8dfc724430f6366dfb90d9bd1c648c78ef1ef7a3701e4ac4461eff9d82be2e2502e588f8bf
|
7
|
+
data.tar.gz: fa7c426893f99844fa702dcbd349bcc141800994e167da25657ea1d5143de442aa5be4be03242221635edf9836d4c1c678e4a522f52c964ddd50f851fdda8d4a
|
data/lib/relaton_w3c/scrapper.rb
CHANGED
@@ -13,7 +13,7 @@ module RelatonW3c
|
|
13
13
|
|
14
14
|
# @param hit [Hash]
|
15
15
|
# @return [RelatonW3c::W3cBibliographicItem]
|
16
|
-
def parse_page(hit)
|
16
|
+
def parse_page(hit) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
17
17
|
resp = Net::HTTP.get_response URI.parse(hit["link"])
|
18
18
|
doc = resp.code == "200" ? Nokogiri::HTML(resp.body) : nil
|
19
19
|
W3cBibliographicItem.new(
|
@@ -28,7 +28,7 @@ module RelatonW3c
|
|
28
28
|
doctype: fetch_doctype(hit, doc),
|
29
29
|
contributor: fetch_contributor(hit, doc),
|
30
30
|
relation: fetch_relation(doc),
|
31
|
-
keyword: hit["keyword"]
|
31
|
+
keyword: hit["keyword"]
|
32
32
|
)
|
33
33
|
end
|
34
34
|
|
@@ -37,19 +37,21 @@ module RelatonW3c
|
|
37
37
|
# @param hit [Hash]
|
38
38
|
# @param doc [Nokogiri::HTML::Document]
|
39
39
|
# @return [Array<RelatonBib::TypedTitleString>]
|
40
|
-
def fetch_title(hit, doc)
|
40
|
+
def fetch_title(hit, doc) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
|
41
41
|
titles = []
|
42
42
|
if doc
|
43
|
-
title = doc.at("//h1[@id
|
44
|
-
titles << { content: title, type: "main" }
|
45
|
-
subtitle = doc.at(
|
46
|
-
|
43
|
+
title = doc.at("//h1[contains(@id, 'title')]")&.text
|
44
|
+
titles << { content: title, type: "main" } if title
|
45
|
+
subtitle = doc.at(
|
46
|
+
"//h2[@id='subtitle']|//p[contains(@class, 'subline')]"
|
47
|
+
)&.text
|
48
|
+
titles << { content: subtitle, tipe: "subtitle" } if subtitle
|
47
49
|
elsif hit["title"]
|
48
50
|
titles << { content: hit["title"], type: "main" }
|
49
51
|
end
|
50
52
|
titles.map do |t|
|
51
53
|
title = RelatonBib::FormattedString.new(
|
52
|
-
content: t[:content], language: "en", script: "Latn"
|
54
|
+
content: t[:content], language: "en", script: "Latn"
|
53
55
|
)
|
54
56
|
RelatonBib::TypedTitleString.new(type: t[:type], title: title)
|
55
57
|
end
|
@@ -75,10 +77,27 @@ module RelatonW3c
|
|
75
77
|
# @param doc [Nokogiri::HTML::Document, NilClass]
|
76
78
|
# @return [Array<RelatonBib::BibliographicDate>]
|
77
79
|
def fetch_date(hit, doc)
|
78
|
-
on = hit["datepub"] || doc
|
80
|
+
on = hit["datepub"] || doc&.at("//h2/time[@datetime]")&.attr(:datetime)
|
81
|
+
on ||= fetch_date1(doc) || fetch_date2(doc)
|
79
82
|
[RelatonBib::BibliographicDate.new(type: "published", on: on)] if on
|
80
83
|
end
|
81
84
|
|
85
|
+
# @param doc [Nokogiri::HTML::Document, NilClass]
|
86
|
+
# @return [String]
|
87
|
+
def fetch_date1(doc)
|
88
|
+
d = doc&.at("//h2[@property='dc:issued']")&.attr(:content)
|
89
|
+
d&.match(/\d{4}-\d{2}-\d{2}/)&.to_s
|
90
|
+
end
|
91
|
+
|
92
|
+
# @param doc [Nokogiri::HTML::Document, NilClass]
|
93
|
+
# @return [String]
|
94
|
+
def fetch_date2(doc)
|
95
|
+
d = doc&.at("//h2[contains(@id, 'w3c-recommendation')]")
|
96
|
+
return unless d
|
97
|
+
|
98
|
+
Date.parse(d.attr(:id.match(/\d{2}-\w+-\d{4}/).to_s)).to_s
|
99
|
+
end
|
100
|
+
|
82
101
|
# @param hit [Hash]
|
83
102
|
# @param doc [Nokogiri::HTML::Document, NilClass]
|
84
103
|
# @return [String]
|
@@ -96,17 +115,19 @@ module RelatonW3c
|
|
96
115
|
# @param hit [Hash]
|
97
116
|
# @param doc [Nokogiri::HTML::Document, NilClass]
|
98
117
|
# @return [Array<RelatonBib::ContributionInfo>]
|
99
|
-
def fetch_contributor(hit, doc)
|
118
|
+
def fetch_contributor(hit, doc) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
|
100
119
|
if doc
|
101
|
-
editors = find_contribs(doc, "Editors").
|
102
|
-
parse_contrib ed, "editor"
|
120
|
+
editors = find_contribs(doc, "Editors").reduce([]) do |mem, ed|
|
121
|
+
c = parse_contrib ed, "editor"
|
122
|
+
mem << c if c
|
123
|
+
mem
|
103
124
|
end
|
104
|
-
contribs = find_contribs(doc, "Authors").reduce(editors) do |mem,
|
105
|
-
ed = mem.detect { |e| e[:id] && e[:id] ==
|
125
|
+
contribs = find_contribs(doc, "Authors").reduce(editors) do |mem, ath|
|
126
|
+
ed = mem.detect { |e| e[:id] && e[:id] == ath["data-editor-id"] }
|
106
127
|
if ed
|
107
128
|
ed[:role] << { type: "author" }
|
108
129
|
else
|
109
|
-
mem << parse_contrib(
|
130
|
+
mem << parse_contrib(ath, "author")
|
110
131
|
end
|
111
132
|
mem
|
112
133
|
end
|
@@ -131,6 +152,8 @@ module RelatonW3c
|
|
131
152
|
# @return [Hash]
|
132
153
|
def parse_contrib(element, type)
|
133
154
|
p = element.at("a")
|
155
|
+
return unless p
|
156
|
+
|
134
157
|
contrib = {
|
135
158
|
name: p.text,
|
136
159
|
url: p[:href],
|
data/lib/relaton_w3c/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: relaton-w3c
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.0
|
4
|
+
version: 1.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-08-
|
11
|
+
date: 2020-08-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: debase
|
@@ -162,7 +162,7 @@ licenses:
|
|
162
162
|
- BSD-2-Clause
|
163
163
|
metadata:
|
164
164
|
homepage_uri: https://github.com/relaton/relaton-wc3
|
165
|
-
post_install_message:
|
165
|
+
post_install_message:
|
166
166
|
rdoc_options: []
|
167
167
|
require_paths:
|
168
168
|
- lib
|
@@ -178,7 +178,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
178
178
|
version: '0'
|
179
179
|
requirements: []
|
180
180
|
rubygems_version: 3.0.6
|
181
|
-
signing_key:
|
181
|
+
signing_key:
|
182
182
|
specification_version: 4
|
183
183
|
summary: 'RelatonIso: retrieve W3C Standards for bibliographic using the IsoBibliographicItem
|
184
184
|
model'
|