relaton-w3c 1.0.2 → 1.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/ubuntu.yml +1 -0
- data/.rubocop.yml +2 -2
- data/grammars/biblio.rng +36 -6
- data/lib/relaton_w3c/hash_converter.rb +7 -0
- data/lib/relaton_w3c/hit_collection.rb +4 -2
- data/lib/relaton_w3c/scrapper.rb +46 -15
- data/lib/relaton_w3c/version.rb +1 -1
- data/lib/relaton_w3c/w3c_bibliographic_item.rb +0 -23
- data/lib/relaton_w3c/xml_parser.rb +6 -13
- data/relaton_w3c.gemspec +1 -1
- metadata +7 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d7dd4430200f1bc42ef19c5e7efd4eb0bee2167d17c80ca44533f0b08e6bdfe6
|
4
|
+
data.tar.gz: 62b4b98f90f7541d8099647f287e94cbe62600ef5943bcd7e315b7d8990f8424
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5480d5eb9a41ef48c7e4a9237f793e96016c32ed3b1cd982b784f4e3e8f0ba84de136b13993ef7a5d6d2fda92540cf5be98339fd198bc93e5486e572958a5feb
|
7
|
+
data.tar.gz: 12de49efbd61c4417ba7d31062f63560883cbe46f5ca9405218e84cb7d815f2717ea691cfd83175aff5b072cc44f7c3284fea526991d9395e9c47789d283b593
|
data/.rubocop.yml
CHANGED
data/grammars/biblio.rng
CHANGED
@@ -88,7 +88,7 @@
|
|
88
88
|
<text/>
|
89
89
|
</element>
|
90
90
|
</define>
|
91
|
-
<define name="LocalizedString">
|
91
|
+
<define name="LocalizedString1">
|
92
92
|
<optional>
|
93
93
|
<!-- multiple languages and scripts possible: comma delimit them if so -->
|
94
94
|
<attribute name="language"/>
|
@@ -98,6 +98,16 @@
|
|
98
98
|
</optional>
|
99
99
|
<text/>
|
100
100
|
</define>
|
101
|
+
<define name="LocalizedString">
|
102
|
+
<choice>
|
103
|
+
<ref name="LocalizedString1"/>
|
104
|
+
<oneOrMore>
|
105
|
+
<element name="variant">
|
106
|
+
<ref name="LocalizedString1"/>
|
107
|
+
</element>
|
108
|
+
</oneOrMore>
|
109
|
+
</choice>
|
110
|
+
</define>
|
101
111
|
<!--
|
102
112
|
Unlike UML, change type to format: type is overloaded
|
103
113
|
Would be need if plain were default value and could omit the attribute
|
@@ -121,7 +131,7 @@
|
|
121
131
|
</optional>
|
122
132
|
<ref name="LocalizedStringOrXsAny"/>
|
123
133
|
</define>
|
124
|
-
<define name="LocalizedStringOrXsAny">
|
134
|
+
<define name="LocalizedStringOrXsAny1">
|
125
135
|
<optional>
|
126
136
|
<!-- multiple languages and scripts possible: comma delimit them if so -->
|
127
137
|
<attribute name="language"/>
|
@@ -136,6 +146,16 @@
|
|
136
146
|
</choice>
|
137
147
|
</oneOrMore>
|
138
148
|
</define>
|
149
|
+
<define name="LocalizedStringOrXsAny">
|
150
|
+
<choice>
|
151
|
+
<ref name="LocalizedStringOrXsAny1"/>
|
152
|
+
<oneOrMore>
|
153
|
+
<element name="variant">
|
154
|
+
<ref name="LocalizedStringOrXsAny1"/>
|
155
|
+
</element>
|
156
|
+
</oneOrMore>
|
157
|
+
</choice>
|
158
|
+
</define>
|
139
159
|
<define name="contributor">
|
140
160
|
<element name="contributor">
|
141
161
|
<zeroOrMore>
|
@@ -512,7 +532,7 @@
|
|
512
532
|
</define>
|
513
533
|
<define name="LocalityType">
|
514
534
|
<data type="string">
|
515
|
-
<param name="pattern">section|clause|part|paragraph|chapter|page|whole|table|annex|figure|note|list|example|volume|issue|time|locality:[a-zA-Z0-9_]+</param>
|
535
|
+
<param name="pattern">section|clause|part|paragraph|chapter|page|whole|table|annex|figure|note|list|example|volume|issue|time|anchor|locality:[a-zA-Z0-9_]+</param>
|
516
536
|
</data>
|
517
537
|
</define>
|
518
538
|
<define name="referenceFrom">
|
@@ -641,9 +661,9 @@
|
|
641
661
|
<optional>
|
642
662
|
<ref name="status"/>
|
643
663
|
</optional>
|
644
|
-
<optional>
|
664
|
+
<zeroOrMore>
|
645
665
|
<ref name="copyright"/>
|
646
|
-
</optional>
|
666
|
+
</zeroOrMore>
|
647
667
|
<zeroOrMore>
|
648
668
|
<ref name="docrelation"/>
|
649
669
|
</zeroOrMore>
|
@@ -1001,7 +1021,17 @@
|
|
1001
1021
|
<optional>
|
1002
1022
|
<ref name="to"/>
|
1003
1023
|
</optional>
|
1004
|
-
<ref name="owner"/>
|
1024
|
+
<oneOrMore>
|
1025
|
+
<ref name="owner"/>
|
1026
|
+
</oneOrMore>
|
1027
|
+
<optional>
|
1028
|
+
<ref name="copyright_scope"/>
|
1029
|
+
</optional>
|
1030
|
+
</element>
|
1031
|
+
</define>
|
1032
|
+
<define name="copyright_scope">
|
1033
|
+
<element name="scope">
|
1034
|
+
<text/>
|
1005
1035
|
</element>
|
1006
1036
|
</define>
|
1007
1037
|
<define name="from">
|
@@ -45,7 +45,9 @@ module RelatonW3c
|
|
45
45
|
/(?<title>.+)\s(?<date>\d{4}-\d{2}-\d{2})$/ =~ title_date
|
46
46
|
title ||= title_date
|
47
47
|
result = data.select do |hit|
|
48
|
-
hit["title"]
|
48
|
+
(hit["title"].casecmp?(title) ||
|
49
|
+
hit["link"].split("/").last.match?(/#{title}/)) &&
|
50
|
+
type_date_filter(hit, type, date)
|
49
51
|
end
|
50
52
|
result.map { |h| Hit.new(h, self) }
|
51
53
|
end
|
@@ -54,7 +56,7 @@ module RelatonW3c
|
|
54
56
|
# @param type [String]
|
55
57
|
# @param date [String]
|
56
58
|
# @return [TrueClass, FalseClass]
|
57
|
-
def type_date_filter(hit, type, date)
|
59
|
+
def type_date_filter(hit, type, date) # rubocop:disable Metrics/AbcSize
|
58
60
|
if type && hit["type"] != short_type(type) || date && hit["date"] != date
|
59
61
|
history = get_history hit, type, date
|
60
62
|
return false unless history.any?
|
data/lib/relaton_w3c/scrapper.rb
CHANGED
@@ -13,11 +13,12 @@ module RelatonW3c
|
|
13
13
|
|
14
14
|
# @param hit [Hash]
|
15
15
|
# @return [RelatonW3c::W3cBibliographicItem]
|
16
|
-
def parse_page(hit)
|
16
|
+
def parse_page(hit) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
17
17
|
resp = Net::HTTP.get_response URI.parse(hit["link"])
|
18
18
|
doc = resp.code == "200" ? Nokogiri::HTML(resp.body) : nil
|
19
19
|
W3cBibliographicItem.new(
|
20
20
|
type: "standard",
|
21
|
+
docid: fetch_docid(hit),
|
21
22
|
fetched: Date.today.to_s,
|
22
23
|
language: ["en"],
|
23
24
|
script: ["Latn"],
|
@@ -28,28 +29,37 @@ module RelatonW3c
|
|
28
29
|
doctype: fetch_doctype(hit, doc),
|
29
30
|
contributor: fetch_contributor(hit, doc),
|
30
31
|
relation: fetch_relation(doc),
|
31
|
-
keyword: hit["keyword"]
|
32
|
+
keyword: hit["keyword"]
|
32
33
|
)
|
33
34
|
end
|
34
35
|
|
35
36
|
private
|
36
37
|
|
38
|
+
# @param hit [Hash]
|
39
|
+
# @return [Array<RelatonBib::DocumentIdentifier>]
|
40
|
+
def fetch_docid(hit)
|
41
|
+
id = hit["link"].split("/").last
|
42
|
+
[RelatonBib::DocumentIdentifier.new(id: id, type: "W3C")]
|
43
|
+
end
|
44
|
+
|
37
45
|
# @param hit [Hash]
|
38
46
|
# @param doc [Nokogiri::HTML::Document]
|
39
47
|
# @return [Array<RelatonBib::TypedTitleString>]
|
40
|
-
def fetch_title(hit, doc)
|
48
|
+
def fetch_title(hit, doc) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
|
41
49
|
titles = []
|
42
50
|
if doc
|
43
|
-
title = doc.at("//h1[@id
|
44
|
-
titles << { content: title, type: "main" }
|
45
|
-
subtitle = doc.at(
|
46
|
-
|
51
|
+
title = doc.at("//h1[contains(@id, 'title')]")&.text
|
52
|
+
titles << { content: title, type: "main" } if title
|
53
|
+
subtitle = doc.at(
|
54
|
+
"//h2[@id='subtitle']|//p[contains(@class, 'subline')]"
|
55
|
+
)&.text
|
56
|
+
titles << { content: subtitle, tipe: "subtitle" } if subtitle
|
47
57
|
elsif hit["title"]
|
48
58
|
titles << { content: hit["title"], type: "main" }
|
49
59
|
end
|
50
60
|
titles.map do |t|
|
51
61
|
title = RelatonBib::FormattedString.new(
|
52
|
-
content: t[:content], language: "en", script: "Latn"
|
62
|
+
content: t[:content], language: "en", script: "Latn"
|
53
63
|
)
|
54
64
|
RelatonBib::TypedTitleString.new(type: t[:type], title: title)
|
55
65
|
end
|
@@ -75,10 +85,27 @@ module RelatonW3c
|
|
75
85
|
# @param doc [Nokogiri::HTML::Document, NilClass]
|
76
86
|
# @return [Array<RelatonBib::BibliographicDate>]
|
77
87
|
def fetch_date(hit, doc)
|
78
|
-
on = hit["datepub"] || doc
|
88
|
+
on = hit["datepub"] || doc&.at("//h2/time[@datetime]")&.attr(:datetime)
|
89
|
+
on ||= fetch_date1(doc) || fetch_date2(doc)
|
79
90
|
[RelatonBib::BibliographicDate.new(type: "published", on: on)] if on
|
80
91
|
end
|
81
92
|
|
93
|
+
# @param doc [Nokogiri::HTML::Document, NilClass]
|
94
|
+
# @return [String]
|
95
|
+
def fetch_date1(doc)
|
96
|
+
d = doc&.at("//h2[@property='dc:issued']")&.attr(:content)
|
97
|
+
d&.match(/\d{4}-\d{2}-\d{2}/)&.to_s
|
98
|
+
end
|
99
|
+
|
100
|
+
# @param doc [Nokogiri::HTML::Document, NilClass]
|
101
|
+
# @return [String]
|
102
|
+
def fetch_date2(doc)
|
103
|
+
d = doc&.at("//h2[contains(@id, 'w3c-recommendation')]")
|
104
|
+
return unless d
|
105
|
+
|
106
|
+
Date.parse(d.attr(:id.match(/\d{2}-\w+-\d{4}/).to_s)).to_s
|
107
|
+
end
|
108
|
+
|
82
109
|
# @param hit [Hash]
|
83
110
|
# @param doc [Nokogiri::HTML::Document, NilClass]
|
84
111
|
# @return [String]
|
@@ -96,17 +123,19 @@ module RelatonW3c
|
|
96
123
|
# @param hit [Hash]
|
97
124
|
# @param doc [Nokogiri::HTML::Document, NilClass]
|
98
125
|
# @return [Array<RelatonBib::ContributionInfo>]
|
99
|
-
def fetch_contributor(hit, doc)
|
126
|
+
def fetch_contributor(hit, doc) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
|
100
127
|
if doc
|
101
|
-
editors = find_contribs(doc, "Editors").
|
102
|
-
parse_contrib ed, "editor"
|
128
|
+
editors = find_contribs(doc, "Editors").reduce([]) do |mem, ed|
|
129
|
+
c = parse_contrib ed, "editor"
|
130
|
+
mem << c if c
|
131
|
+
mem
|
103
132
|
end
|
104
|
-
contribs = find_contribs(doc, "Authors").reduce(editors) do |mem,
|
105
|
-
ed = mem.detect { |e| e[:id] && e[:id] ==
|
133
|
+
contribs = find_contribs(doc, "Authors").reduce(editors) do |mem, ath|
|
134
|
+
ed = mem.detect { |e| e[:id] && e[:id] == ath["data-editor-id"] }
|
106
135
|
if ed
|
107
136
|
ed[:role] << { type: "author" }
|
108
137
|
else
|
109
|
-
mem << parse_contrib(
|
138
|
+
mem << parse_contrib(ath, "author")
|
110
139
|
end
|
111
140
|
mem
|
112
141
|
end
|
@@ -131,6 +160,8 @@ module RelatonW3c
|
|
131
160
|
# @return [Hash]
|
132
161
|
def parse_contrib(element, type)
|
133
162
|
p = element.at("a")
|
163
|
+
return unless p
|
164
|
+
|
134
165
|
contrib = {
|
135
166
|
name: p.text,
|
136
167
|
url: p[:href],
|
data/lib/relaton_w3c/version.rb
CHANGED
@@ -5,35 +5,12 @@ module RelatonW3c
|
|
5
5
|
proposedRecommendation recommendation retired workingDraft
|
6
6
|
].freeze
|
7
7
|
|
8
|
-
attr_reader :doctype
|
9
|
-
|
10
8
|
# @param doctype [String]
|
11
9
|
def initialize(**args)
|
12
10
|
if args[:doctype] && !TYPES.include?(args[:doctype])
|
13
11
|
warn "[relaton-w3c] invalid document type: #{args[:doctype]}"
|
14
12
|
end
|
15
|
-
@doctype = args.delete :doctype
|
16
13
|
super **args
|
17
14
|
end
|
18
|
-
|
19
|
-
# @param builder [Nokogiri::XML::Builder, NilClass]
|
20
|
-
# @param opts [Hash]
|
21
|
-
# @option opts [TrueClass, FalseClass, NilClass] bibdata
|
22
|
-
def to_xml(builder = nil, **opts)
|
23
|
-
super builder, **opts do |b|
|
24
|
-
if opts[:bibdata] && doctype
|
25
|
-
b.ext do |e|
|
26
|
-
e.doctype doctype if doctype
|
27
|
-
end
|
28
|
-
end
|
29
|
-
end
|
30
|
-
end
|
31
|
-
|
32
|
-
# @return [Hash]
|
33
|
-
def to_hash
|
34
|
-
hash = super
|
35
|
-
hash["doctype"] = doctype if doctype
|
36
|
-
hash
|
37
|
-
end
|
38
15
|
end
|
39
16
|
end
|
@@ -1,19 +1,6 @@
|
|
1
1
|
module RelatonW3c
|
2
2
|
class XMLParser < RelatonBib::XMLParser
|
3
3
|
class << self
|
4
|
-
# @param xml [String]
|
5
|
-
# @return [RelatonW3c::W3cBibliographicItem, NilClass]
|
6
|
-
def from_xml(xml)
|
7
|
-
doc = Nokogiri::XML xml
|
8
|
-
doc.remove_namespaces!
|
9
|
-
item = doc.at("/bibitem|/bibdata")
|
10
|
-
if item
|
11
|
-
W3cBibliographicItem.new(item_data(item))
|
12
|
-
else
|
13
|
-
warn "[relaton-w3c] can't find bibitem or bibdata element in the XML"
|
14
|
-
end
|
15
|
-
end
|
16
|
-
|
17
4
|
private
|
18
5
|
|
19
6
|
# Override RelatonBib::XMLParser.item_data method.
|
@@ -27,6 +14,12 @@ module RelatonW3c
|
|
27
14
|
data[:doctype] = ext.at("./doctype")&.text
|
28
15
|
data
|
29
16
|
end
|
17
|
+
|
18
|
+
# @param item_hash [Hash]
|
19
|
+
# @return [RelatonBib::BibliographicItem]
|
20
|
+
def bib_item(item_hash)
|
21
|
+
W3cBibliographicItem.new item_hash
|
22
|
+
end
|
30
23
|
end
|
31
24
|
end
|
32
25
|
end
|
data/relaton_w3c.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: relaton-w3c
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.2
|
4
|
+
version: 1.3.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-09-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: debase
|
@@ -114,14 +114,14 @@ dependencies:
|
|
114
114
|
requirements:
|
115
115
|
- - ">="
|
116
116
|
- !ruby/object:Gem::Version
|
117
|
-
version: 1.0
|
117
|
+
version: 1.3.0
|
118
118
|
type: :runtime
|
119
119
|
prerelease: false
|
120
120
|
version_requirements: !ruby/object:Gem::Requirement
|
121
121
|
requirements:
|
122
122
|
- - ">="
|
123
123
|
- !ruby/object:Gem::Version
|
124
|
-
version: 1.0
|
124
|
+
version: 1.3.0
|
125
125
|
description: 'RelatonIso: retrieve W3C Standards for bibliographic using the IsoBibliographicItem
|
126
126
|
model'
|
127
127
|
email:
|
@@ -162,7 +162,7 @@ licenses:
|
|
162
162
|
- BSD-2-Clause
|
163
163
|
metadata:
|
164
164
|
homepage_uri: https://github.com/relaton/relaton-wc3
|
165
|
-
post_install_message:
|
165
|
+
post_install_message:
|
166
166
|
rdoc_options: []
|
167
167
|
require_paths:
|
168
168
|
- lib
|
@@ -178,7 +178,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
178
178
|
version: '0'
|
179
179
|
requirements: []
|
180
180
|
rubygems_version: 3.0.6
|
181
|
-
signing_key:
|
181
|
+
signing_key:
|
182
182
|
specification_version: 4
|
183
183
|
summary: 'RelatonIso: retrieve W3C Standards for bibliographic using the IsoBibliographicItem
|
184
184
|
model'
|