relaton-w3c 1.3.0 → 1.5.pre
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.adoc +32 -22
- data/lib/relaton_w3c/hit_collection.rb +7 -2
- data/lib/relaton_w3c/scrapper.rb +48 -16
- data/lib/relaton_w3c/version.rb +1 -1
- data/relaton_w3c.gemspec +1 -1
- metadata +9 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1e722b7477c8308d552d6074c1ff45e0475f4b514972f84d616a979c6cad47de
|
4
|
+
data.tar.gz: 50e24a13d39126eeec22a7dc04c2c8ed51ded7f57d3ab8be88a5c834808f3d15
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4dba20d4305cd58b0542b4ac6f455a87a2fd96d44513ea34a576987df910891e0df7e64eede636f76c9d9499706bac2eaa90baa238bcc527a470bc555ed707b3
|
7
|
+
data.tar.gz: 39cfc5b3e6138b6d209b193cc94fa6fb6b2f56d2ab78284e0b2b950cd08707c8dd3c67d35afbf1a9cc1fa0a7a07cf598065f62a085d26948e11173f59580d40e
|
data/README.adoc
CHANGED
@@ -10,7 +10,7 @@ Add this line to your application's Gemfile:
|
|
10
10
|
|
11
11
|
[source,ruby]
|
12
12
|
----
|
13
|
-
gem '
|
13
|
+
gem 'relaton-w3c'
|
14
14
|
----
|
15
15
|
|
16
16
|
And then execute:
|
@@ -19,14 +19,16 @@ And then execute:
|
|
19
19
|
|
20
20
|
Or install it yourself as:
|
21
21
|
|
22
|
-
$ gem install
|
22
|
+
$ gem install relaton-w3c
|
23
23
|
|
24
24
|
== Usage
|
25
25
|
|
26
26
|
=== Search for a standard using keywords
|
27
27
|
|
28
|
+
[source,ruby]
|
28
29
|
----
|
29
30
|
require 'relaton_w3c'
|
31
|
+
=> true
|
30
32
|
|
31
33
|
hits = RelatonW3c::W3cBibliography.search("W3C JSON-LD 1.1")
|
32
34
|
=> <RelatonW3c::HitCollection:0x007f93b5e4ff48 @ref=W3C JSON-LD 1.1 @fetched=false>
|
@@ -38,40 +40,43 @@ item = hits[0].fetch
|
|
38
40
|
|
39
41
|
=== XML serialization
|
40
42
|
|
43
|
+
[source,ruby]
|
41
44
|
----
|
42
45
|
item.to_xml
|
43
|
-
=> "<bibitem type="standard">
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
46
|
+
=> "<bibitem id="CR-json-ld11-20200316" type="standard">
|
47
|
+
<fetched>2020-04-07</fetched>
|
48
|
+
<title type="main" format="text/plain" language="en" script="Latn">JSON-LD 1.1</title>
|
49
|
+
<title format="text/plain" language="en" script="Latn">A JSON-based Serialization for Linked Data</title>
|
50
|
+
<uri type="src">https://www.w3.org/TR/2020/CR-json-ld11-20200316/</uri>
|
51
|
+
<date type="published">
|
52
|
+
<on>2020</on>
|
53
|
+
</date>
|
54
|
+
...
|
52
55
|
</bibitem>"
|
53
56
|
----
|
54
57
|
|
55
58
|
With argument `bibdata: true` it outputs XML wrapped by a `bibdata` element and adds a flavour `ext` element.
|
56
59
|
|
60
|
+
[source,ruby]
|
57
61
|
----
|
58
62
|
item.to_xml bibdata: true
|
59
63
|
=> "<bibdata type="standard">
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
64
|
+
<fetched>2020-04-07</fetched>
|
65
|
+
<title type="main" format="text/plain" language="en" script="Latn">JSON-LD 1.1</title>
|
66
|
+
<title format="text/plain" language="en" script="Latn">A JSON-based Serialization for Linked Data</title>
|
67
|
+
<uri type="src">https://www.w3.org/TR/2020/CR-json-ld11-20200316/</uri>
|
68
|
+
<date type="published">
|
69
|
+
<on>2020</on>
|
70
|
+
</date>
|
71
|
+
...
|
72
|
+
<ext>
|
73
|
+
<doctype>candidateRecommendation</doctype>
|
74
|
+
</ext>
|
71
75
|
</bibdata>"
|
72
76
|
----
|
73
77
|
|
74
78
|
=== Get document by title
|
79
|
+
[source,ruby]
|
75
80
|
----
|
76
81
|
RelatonW3c::W3cBibliography.get "W3C JSON-LD 1.1"
|
77
82
|
[relaton-w3c] ("W3C JSON-LD 1.1") fetching...
|
@@ -81,6 +86,7 @@ RelatonW3c::W3cBibliography.get "W3C JSON-LD 1.1"
|
|
81
86
|
----
|
82
87
|
|
83
88
|
=== Get document by title and type
|
89
|
+
[source,ruby]
|
84
90
|
----
|
85
91
|
RelatonW3c::W3cBibliography.get "W3C Candidate Recommendation JSON-LD 1.1"
|
86
92
|
[relaton-w3c] ("W3C Candidate Recommendation JSON-LD 1.1") fetching...
|
@@ -90,6 +96,7 @@ RelatonW3c::W3cBibliography.get "W3C Candidate Recommendation JSON-LD 1.1"
|
|
90
96
|
----
|
91
97
|
|
92
98
|
=== Get document by title and short type
|
99
|
+
[source,ruby]
|
93
100
|
----
|
94
101
|
RelatonW3c::W3cBibliography.get "W3C CR JSON-LD 1.1"
|
95
102
|
[relaton-w3c] ("W3C CR JSON-LD 1.1") fetching...
|
@@ -99,6 +106,7 @@ RelatonW3c::W3cBibliography.get "W3C CR JSON-LD 1.1"
|
|
99
106
|
----
|
100
107
|
|
101
108
|
=== Get document by title, type, and date
|
109
|
+
[source,ruby]
|
102
110
|
----
|
103
111
|
RelatonW3c::W3cBibliography.get "W3C WD JSON-LD 1.1 2019-10-18"
|
104
112
|
[relaton-w3c] ("W3C WD JSON-LD 1.1 2019-10-18") fetching...
|
@@ -108,6 +116,7 @@ RelatonW3c::W3cBibliography.get "W3C WD JSON-LD 1.1 2019-10-18"
|
|
108
116
|
----
|
109
117
|
|
110
118
|
=== Create bibliographic item from XML
|
119
|
+
[source,ruby]
|
111
120
|
----
|
112
121
|
RelatonW3c::XMLParser.from_xml File.read('spec/fixtures/cr_json_ld11.xml')
|
113
122
|
=> #<RelatonW3c::W3cBibliographicItem:0x007f9381efce98
|
@@ -115,6 +124,7 @@ RelatonW3c::XMLParser.from_xml File.read('spec/fixtures/cr_json_ld11.xml')
|
|
115
124
|
----
|
116
125
|
|
117
126
|
=== Create bibliographic item from YAML
|
127
|
+
[source,ruby]
|
118
128
|
----
|
119
129
|
hash = YAML.load_file 'spec/fixtures/cr_json_ld11.yml'
|
120
130
|
=> {"title"=>
|
@@ -45,7 +45,12 @@ module RelatonW3c
|
|
45
45
|
/(?<title>.+)\s(?<date>\d{4}-\d{2}-\d{2})$/ =~ title_date
|
46
46
|
title ||= title_date
|
47
47
|
result = data.select do |hit|
|
48
|
-
hit["title"]
|
48
|
+
(hit["title"].casecmp?(title) ||
|
49
|
+
hit["link"].split("/").last.match?(/-#{title}-/)) &&
|
50
|
+
type_date_filter(hit, type, date)
|
51
|
+
end
|
52
|
+
if result.empty?
|
53
|
+
result = data.select { |h| h["link"].split("/").last.match? /#{title}/ }
|
49
54
|
end
|
50
55
|
result.map { |h| Hit.new(h, self) }
|
51
56
|
end
|
@@ -54,7 +59,7 @@ module RelatonW3c
|
|
54
59
|
# @param type [String]
|
55
60
|
# @param date [String]
|
56
61
|
# @return [TrueClass, FalseClass]
|
57
|
-
def type_date_filter(hit, type, date)
|
62
|
+
def type_date_filter(hit, type, date) # rubocop:disable Metrics/AbcSize
|
58
63
|
if type && hit["type"] != short_type(type) || date && hit["date"] != date
|
59
64
|
history = get_history hit, type, date
|
60
65
|
return false unless history.any?
|
data/lib/relaton_w3c/scrapper.rb
CHANGED
@@ -13,11 +13,12 @@ module RelatonW3c
|
|
13
13
|
|
14
14
|
# @param hit [Hash]
|
15
15
|
# @return [RelatonW3c::W3cBibliographicItem]
|
16
|
-
def parse_page(hit)
|
16
|
+
def parse_page(hit) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
17
17
|
resp = Net::HTTP.get_response URI.parse(hit["link"])
|
18
18
|
doc = resp.code == "200" ? Nokogiri::HTML(resp.body) : nil
|
19
19
|
W3cBibliographicItem.new(
|
20
20
|
type: "standard",
|
21
|
+
docid: fetch_docid(hit),
|
21
22
|
fetched: Date.today.to_s,
|
22
23
|
language: ["en"],
|
23
24
|
script: ["Latn"],
|
@@ -28,28 +29,38 @@ module RelatonW3c
|
|
28
29
|
doctype: fetch_doctype(hit, doc),
|
29
30
|
contributor: fetch_contributor(hit, doc),
|
30
31
|
relation: fetch_relation(doc),
|
31
|
-
keyword: hit["keyword"]
|
32
|
+
keyword: hit["keyword"]
|
32
33
|
)
|
33
34
|
end
|
34
35
|
|
35
36
|
private
|
36
37
|
|
38
|
+
# @param hit [Hash]
|
39
|
+
# @return [Array<RelatonBib::DocumentIdentifier>]
|
40
|
+
def fetch_docid(hit)
|
41
|
+
id = hit["link"].split("/").last
|
42
|
+
[RelatonBib::DocumentIdentifier.new(id: id, type: "W3C")]
|
43
|
+
end
|
44
|
+
|
37
45
|
# @param hit [Hash]
|
38
46
|
# @param doc [Nokogiri::HTML::Document]
|
39
47
|
# @return [Array<RelatonBib::TypedTitleString>]
|
40
|
-
def fetch_title(hit, doc)
|
48
|
+
def fetch_title(hit, doc) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
|
41
49
|
titles = []
|
42
50
|
if doc
|
43
|
-
title = doc.at("
|
44
|
-
titles << { content: title, type: "main" }
|
45
|
-
subtitle = doc.at(
|
46
|
-
|
47
|
-
|
51
|
+
title = doc.at("//*[contains(@id, 'title')]")&.text
|
52
|
+
titles << { content: title, type: "main" } if title && !title.empty?
|
53
|
+
subtitle = doc.at(
|
54
|
+
"//h2[@id='subtitle']|//p[contains(@class, 'subline')]"
|
55
|
+
)&.text
|
56
|
+
titles << { content: subtitle, tipe: "subtitle" } if subtitle
|
57
|
+
end
|
58
|
+
if titles.empty? && hit["title"]
|
48
59
|
titles << { content: hit["title"], type: "main" }
|
49
60
|
end
|
50
61
|
titles.map do |t|
|
51
62
|
title = RelatonBib::FormattedString.new(
|
52
|
-
content: t[:content], language: "en", script: "Latn"
|
63
|
+
content: t[:content], language: "en", script: "Latn"
|
53
64
|
)
|
54
65
|
RelatonBib::TypedTitleString.new(type: t[:type], title: title)
|
55
66
|
end
|
@@ -75,10 +86,27 @@ module RelatonW3c
|
|
75
86
|
# @param doc [Nokogiri::HTML::Document, NilClass]
|
76
87
|
# @return [Array<RelatonBib::BibliographicDate>]
|
77
88
|
def fetch_date(hit, doc)
|
78
|
-
on = hit["datepub"] || doc
|
89
|
+
on = hit["datepub"] || doc&.at("//h2/time[@datetime]")&.attr(:datetime)
|
90
|
+
on ||= fetch_date1(doc) || fetch_date2(doc)
|
79
91
|
[RelatonBib::BibliographicDate.new(type: "published", on: on)] if on
|
80
92
|
end
|
81
93
|
|
94
|
+
# @param doc [Nokogiri::HTML::Document, NilClass]
|
95
|
+
# @return [String]
|
96
|
+
def fetch_date1(doc)
|
97
|
+
d = doc&.at("//h2[@property='dc:issued']")&.attr(:content)
|
98
|
+
d&.match(/\d{4}-\d{2}-\d{2}/)&.to_s
|
99
|
+
end
|
100
|
+
|
101
|
+
# @param doc [Nokogiri::HTML::Document, NilClass]
|
102
|
+
# @return [String]
|
103
|
+
def fetch_date2(doc)
|
104
|
+
d = doc&.at("//h2[contains(@id, 'w3c-recommendation')]")
|
105
|
+
return unless d
|
106
|
+
|
107
|
+
Date.parse(d.attr(:id.match(/\d{2}-\w+-\d{4}/).to_s)).to_s
|
108
|
+
end
|
109
|
+
|
82
110
|
# @param hit [Hash]
|
83
111
|
# @param doc [Nokogiri::HTML::Document, NilClass]
|
84
112
|
# @return [String]
|
@@ -96,17 +124,19 @@ module RelatonW3c
|
|
96
124
|
# @param hit [Hash]
|
97
125
|
# @param doc [Nokogiri::HTML::Document, NilClass]
|
98
126
|
# @return [Array<RelatonBib::ContributionInfo>]
|
99
|
-
def fetch_contributor(hit, doc)
|
127
|
+
def fetch_contributor(hit, doc) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
|
100
128
|
if doc
|
101
|
-
editors = find_contribs(doc, "Editors").
|
102
|
-
parse_contrib ed, "editor"
|
129
|
+
editors = find_contribs(doc, "Editors").reduce([]) do |mem, ed|
|
130
|
+
c = parse_contrib ed, "editor"
|
131
|
+
mem << c if c
|
132
|
+
mem
|
103
133
|
end
|
104
|
-
contribs = find_contribs(doc, "Authors").reduce(editors) do |mem,
|
105
|
-
ed = mem.detect { |e| e[:id] && e[:id] ==
|
134
|
+
contribs = find_contribs(doc, "Authors").reduce(editors) do |mem, ath|
|
135
|
+
ed = mem.detect { |e| e[:id] && e[:id] == ath["data-editor-id"] }
|
106
136
|
if ed
|
107
137
|
ed[:role] << { type: "author" }
|
108
138
|
else
|
109
|
-
mem << parse_contrib(
|
139
|
+
mem << parse_contrib(ath, "author")
|
110
140
|
end
|
111
141
|
mem
|
112
142
|
end
|
@@ -131,6 +161,8 @@ module RelatonW3c
|
|
131
161
|
# @return [Hash]
|
132
162
|
def parse_contrib(element, type)
|
133
163
|
p = element.at("a")
|
164
|
+
return unless p
|
165
|
+
|
134
166
|
contrib = {
|
135
167
|
name: p.text,
|
136
168
|
url: p[:href],
|
data/lib/relaton_w3c/version.rb
CHANGED
data/relaton_w3c.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: relaton-w3c
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.5.pre
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-10-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: debase
|
@@ -114,14 +114,14 @@ dependencies:
|
|
114
114
|
requirements:
|
115
115
|
- - ">="
|
116
116
|
- !ruby/object:Gem::Version
|
117
|
-
version: 1.
|
117
|
+
version: 1.5.pre
|
118
118
|
type: :runtime
|
119
119
|
prerelease: false
|
120
120
|
version_requirements: !ruby/object:Gem::Requirement
|
121
121
|
requirements:
|
122
122
|
- - ">="
|
123
123
|
- !ruby/object:Gem::Version
|
124
|
-
version: 1.
|
124
|
+
version: 1.5.pre
|
125
125
|
description: 'RelatonIso: retrieve W3C Standards for bibliographic using the IsoBibliographicItem
|
126
126
|
model'
|
127
127
|
email:
|
@@ -162,7 +162,7 @@ licenses:
|
|
162
162
|
- BSD-2-Clause
|
163
163
|
metadata:
|
164
164
|
homepage_uri: https://github.com/relaton/relaton-wc3
|
165
|
-
post_install_message:
|
165
|
+
post_install_message:
|
166
166
|
rdoc_options: []
|
167
167
|
require_paths:
|
168
168
|
- lib
|
@@ -173,12 +173,12 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
173
173
|
version: 2.4.0
|
174
174
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
175
175
|
requirements:
|
176
|
-
- - "
|
176
|
+
- - ">"
|
177
177
|
- !ruby/object:Gem::Version
|
178
|
-
version:
|
178
|
+
version: 1.3.1
|
179
179
|
requirements: []
|
180
180
|
rubygems_version: 3.0.6
|
181
|
-
signing_key:
|
181
|
+
signing_key:
|
182
182
|
specification_version: 4
|
183
183
|
summary: 'RelatonIso: retrieve W3C Standards for bibliographic using the IsoBibliographicItem
|
184
184
|
model'
|