relaton-w3c 1.0.2 → 1.3.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/ubuntu.yml +1 -0
- data/.rubocop.yml +2 -2
- data/grammars/biblio.rng +36 -6
- data/lib/relaton_w3c/hash_converter.rb +7 -0
- data/lib/relaton_w3c/hit_collection.rb +4 -2
- data/lib/relaton_w3c/scrapper.rb +46 -15
- data/lib/relaton_w3c/version.rb +1 -1
- data/lib/relaton_w3c/w3c_bibliographic_item.rb +0 -23
- data/lib/relaton_w3c/xml_parser.rb +6 -13
- data/relaton_w3c.gemspec +1 -1
- metadata +7 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d7dd4430200f1bc42ef19c5e7efd4eb0bee2167d17c80ca44533f0b08e6bdfe6
|
4
|
+
data.tar.gz: 62b4b98f90f7541d8099647f287e94cbe62600ef5943bcd7e315b7d8990f8424
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5480d5eb9a41ef48c7e4a9237f793e96016c32ed3b1cd982b784f4e3e8f0ba84de136b13993ef7a5d6d2fda92540cf5be98339fd198bc93e5486e572958a5feb
|
7
|
+
data.tar.gz: 12de49efbd61c4417ba7d31062f63560883cbe46f5ca9405218e84cb7d815f2717ea691cfd83175aff5b072cc44f7c3284fea526991d9395e9c47789d283b593
|
data/.rubocop.yml
CHANGED
data/grammars/biblio.rng
CHANGED
@@ -88,7 +88,7 @@
|
|
88
88
|
<text/>
|
89
89
|
</element>
|
90
90
|
</define>
|
91
|
-
<define name="LocalizedString">
|
91
|
+
<define name="LocalizedString1">
|
92
92
|
<optional>
|
93
93
|
<!-- multiple languages and scripts possible: comma delimit them if so -->
|
94
94
|
<attribute name="language"/>
|
@@ -98,6 +98,16 @@
|
|
98
98
|
</optional>
|
99
99
|
<text/>
|
100
100
|
</define>
|
101
|
+
<define name="LocalizedString">
|
102
|
+
<choice>
|
103
|
+
<ref name="LocalizedString1"/>
|
104
|
+
<oneOrMore>
|
105
|
+
<element name="variant">
|
106
|
+
<ref name="LocalizedString1"/>
|
107
|
+
</element>
|
108
|
+
</oneOrMore>
|
109
|
+
</choice>
|
110
|
+
</define>
|
101
111
|
<!--
|
102
112
|
Unlike UML, change type to format: type is overloaded
|
103
113
|
Would be need if plain were default value and could omit the attribute
|
@@ -121,7 +131,7 @@
|
|
121
131
|
</optional>
|
122
132
|
<ref name="LocalizedStringOrXsAny"/>
|
123
133
|
</define>
|
124
|
-
<define name="LocalizedStringOrXsAny">
|
134
|
+
<define name="LocalizedStringOrXsAny1">
|
125
135
|
<optional>
|
126
136
|
<!-- multiple languages and scripts possible: comma delimit them if so -->
|
127
137
|
<attribute name="language"/>
|
@@ -136,6 +146,16 @@
|
|
136
146
|
</choice>
|
137
147
|
</oneOrMore>
|
138
148
|
</define>
|
149
|
+
<define name="LocalizedStringOrXsAny">
|
150
|
+
<choice>
|
151
|
+
<ref name="LocalizedStringOrXsAny1"/>
|
152
|
+
<oneOrMore>
|
153
|
+
<element name="variant">
|
154
|
+
<ref name="LocalizedStringOrXsAny1"/>
|
155
|
+
</element>
|
156
|
+
</oneOrMore>
|
157
|
+
</choice>
|
158
|
+
</define>
|
139
159
|
<define name="contributor">
|
140
160
|
<element name="contributor">
|
141
161
|
<zeroOrMore>
|
@@ -512,7 +532,7 @@
|
|
512
532
|
</define>
|
513
533
|
<define name="LocalityType">
|
514
534
|
<data type="string">
|
515
|
-
<param name="pattern">section|clause|part|paragraph|chapter|page|whole|table|annex|figure|note|list|example|volume|issue|time|locality:[a-zA-Z0-9_]+</param>
|
535
|
+
<param name="pattern">section|clause|part|paragraph|chapter|page|whole|table|annex|figure|note|list|example|volume|issue|time|anchor|locality:[a-zA-Z0-9_]+</param>
|
516
536
|
</data>
|
517
537
|
</define>
|
518
538
|
<define name="referenceFrom">
|
@@ -641,9 +661,9 @@
|
|
641
661
|
<optional>
|
642
662
|
<ref name="status"/>
|
643
663
|
</optional>
|
644
|
-
<
|
664
|
+
<zeroOrMore>
|
645
665
|
<ref name="copyright"/>
|
646
|
-
</
|
666
|
+
</zeroOrMore>
|
647
667
|
<zeroOrMore>
|
648
668
|
<ref name="docrelation"/>
|
649
669
|
</zeroOrMore>
|
@@ -1001,7 +1021,17 @@
|
|
1001
1021
|
<optional>
|
1002
1022
|
<ref name="to"/>
|
1003
1023
|
</optional>
|
1004
|
-
<
|
1024
|
+
<oneOrMore>
|
1025
|
+
<ref name="owner"/>
|
1026
|
+
</oneOrMore>
|
1027
|
+
<optional>
|
1028
|
+
<ref name="copyright_scope"/>
|
1029
|
+
</optional>
|
1030
|
+
</element>
|
1031
|
+
</define>
|
1032
|
+
<define name="copyright_scope">
|
1033
|
+
<element name="scope">
|
1034
|
+
<text/>
|
1005
1035
|
</element>
|
1006
1036
|
</define>
|
1007
1037
|
<define name="from">
|
@@ -45,7 +45,9 @@ module RelatonW3c
|
|
45
45
|
/(?<title>.+)\s(?<date>\d{4}-\d{2}-\d{2})$/ =~ title_date
|
46
46
|
title ||= title_date
|
47
47
|
result = data.select do |hit|
|
48
|
-
hit["title"]
|
48
|
+
(hit["title"].casecmp?(title) ||
|
49
|
+
hit["link"].split("/").last.match?(/#{title}/)) &&
|
50
|
+
type_date_filter(hit, type, date)
|
49
51
|
end
|
50
52
|
result.map { |h| Hit.new(h, self) }
|
51
53
|
end
|
@@ -54,7 +56,7 @@ module RelatonW3c
|
|
54
56
|
# @param type [String]
|
55
57
|
# @param date [String]
|
56
58
|
# @return [TrueClass, FalseClass]
|
57
|
-
def type_date_filter(hit, type, date)
|
59
|
+
def type_date_filter(hit, type, date) # rubocop:disable Metrics/AbcSize
|
58
60
|
if type && hit["type"] != short_type(type) || date && hit["date"] != date
|
59
61
|
history = get_history hit, type, date
|
60
62
|
return false unless history.any?
|
data/lib/relaton_w3c/scrapper.rb
CHANGED
@@ -13,11 +13,12 @@ module RelatonW3c
|
|
13
13
|
|
14
14
|
# @param hit [Hash]
|
15
15
|
# @return [RelatonW3c::W3cBibliographicItem]
|
16
|
-
def parse_page(hit)
|
16
|
+
def parse_page(hit) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
17
17
|
resp = Net::HTTP.get_response URI.parse(hit["link"])
|
18
18
|
doc = resp.code == "200" ? Nokogiri::HTML(resp.body) : nil
|
19
19
|
W3cBibliographicItem.new(
|
20
20
|
type: "standard",
|
21
|
+
docid: fetch_docid(hit),
|
21
22
|
fetched: Date.today.to_s,
|
22
23
|
language: ["en"],
|
23
24
|
script: ["Latn"],
|
@@ -28,28 +29,37 @@ module RelatonW3c
|
|
28
29
|
doctype: fetch_doctype(hit, doc),
|
29
30
|
contributor: fetch_contributor(hit, doc),
|
30
31
|
relation: fetch_relation(doc),
|
31
|
-
keyword: hit["keyword"]
|
32
|
+
keyword: hit["keyword"]
|
32
33
|
)
|
33
34
|
end
|
34
35
|
|
35
36
|
private
|
36
37
|
|
38
|
+
# @param hit [Hash]
|
39
|
+
# @return [Array<RelatonBib::DocumentIdentifier>]
|
40
|
+
def fetch_docid(hit)
|
41
|
+
id = hit["link"].split("/").last
|
42
|
+
[RelatonBib::DocumentIdentifier.new(id: id, type: "W3C")]
|
43
|
+
end
|
44
|
+
|
37
45
|
# @param hit [Hash]
|
38
46
|
# @param doc [Nokogiri::HTML::Document]
|
39
47
|
# @return [Array<RelatonBib::TypedTitleString>]
|
40
|
-
def fetch_title(hit, doc)
|
48
|
+
def fetch_title(hit, doc) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
|
41
49
|
titles = []
|
42
50
|
if doc
|
43
|
-
title = doc.at("//h1[@id
|
44
|
-
titles << { content: title, type: "main" }
|
45
|
-
subtitle = doc.at(
|
46
|
-
|
51
|
+
title = doc.at("//h1[contains(@id, 'title')]")&.text
|
52
|
+
titles << { content: title, type: "main" } if title
|
53
|
+
subtitle = doc.at(
|
54
|
+
"//h2[@id='subtitle']|//p[contains(@class, 'subline')]"
|
55
|
+
)&.text
|
56
|
+
titles << { content: subtitle, tipe: "subtitle" } if subtitle
|
47
57
|
elsif hit["title"]
|
48
58
|
titles << { content: hit["title"], type: "main" }
|
49
59
|
end
|
50
60
|
titles.map do |t|
|
51
61
|
title = RelatonBib::FormattedString.new(
|
52
|
-
content: t[:content], language: "en", script: "Latn"
|
62
|
+
content: t[:content], language: "en", script: "Latn"
|
53
63
|
)
|
54
64
|
RelatonBib::TypedTitleString.new(type: t[:type], title: title)
|
55
65
|
end
|
@@ -75,10 +85,27 @@ module RelatonW3c
|
|
75
85
|
# @param doc [Nokogiri::HTML::Document, NilClass]
|
76
86
|
# @return [Array<RelatonBib::BibliographicDate>]
|
77
87
|
def fetch_date(hit, doc)
|
78
|
-
on = hit["datepub"] || doc
|
88
|
+
on = hit["datepub"] || doc&.at("//h2/time[@datetime]")&.attr(:datetime)
|
89
|
+
on ||= fetch_date1(doc) || fetch_date2(doc)
|
79
90
|
[RelatonBib::BibliographicDate.new(type: "published", on: on)] if on
|
80
91
|
end
|
81
92
|
|
93
|
+
# @param doc [Nokogiri::HTML::Document, NilClass]
|
94
|
+
# @return [String]
|
95
|
+
def fetch_date1(doc)
|
96
|
+
d = doc&.at("//h2[@property='dc:issued']")&.attr(:content)
|
97
|
+
d&.match(/\d{4}-\d{2}-\d{2}/)&.to_s
|
98
|
+
end
|
99
|
+
|
100
|
+
# @param doc [Nokogiri::HTML::Document, NilClass]
|
101
|
+
# @return [String]
|
102
|
+
def fetch_date2(doc)
|
103
|
+
d = doc&.at("//h2[contains(@id, 'w3c-recommendation')]")
|
104
|
+
return unless d
|
105
|
+
|
106
|
+
Date.parse(d.attr(:id.match(/\d{2}-\w+-\d{4}/).to_s)).to_s
|
107
|
+
end
|
108
|
+
|
82
109
|
# @param hit [Hash]
|
83
110
|
# @param doc [Nokogiri::HTML::Document, NilClass]
|
84
111
|
# @return [String]
|
@@ -96,17 +123,19 @@ module RelatonW3c
|
|
96
123
|
# @param hit [Hash]
|
97
124
|
# @param doc [Nokogiri::HTML::Document, NilClass]
|
98
125
|
# @return [Array<RelatonBib::ContributionInfo>]
|
99
|
-
def fetch_contributor(hit, doc)
|
126
|
+
def fetch_contributor(hit, doc) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
|
100
127
|
if doc
|
101
|
-
editors = find_contribs(doc, "Editors").
|
102
|
-
parse_contrib ed, "editor"
|
128
|
+
editors = find_contribs(doc, "Editors").reduce([]) do |mem, ed|
|
129
|
+
c = parse_contrib ed, "editor"
|
130
|
+
mem << c if c
|
131
|
+
mem
|
103
132
|
end
|
104
|
-
contribs = find_contribs(doc, "Authors").reduce(editors) do |mem,
|
105
|
-
ed = mem.detect { |e| e[:id] && e[:id] ==
|
133
|
+
contribs = find_contribs(doc, "Authors").reduce(editors) do |mem, ath|
|
134
|
+
ed = mem.detect { |e| e[:id] && e[:id] == ath["data-editor-id"] }
|
106
135
|
if ed
|
107
136
|
ed[:role] << { type: "author" }
|
108
137
|
else
|
109
|
-
mem << parse_contrib(
|
138
|
+
mem << parse_contrib(ath, "author")
|
110
139
|
end
|
111
140
|
mem
|
112
141
|
end
|
@@ -131,6 +160,8 @@ module RelatonW3c
|
|
131
160
|
# @return [Hash]
|
132
161
|
def parse_contrib(element, type)
|
133
162
|
p = element.at("a")
|
163
|
+
return unless p
|
164
|
+
|
134
165
|
contrib = {
|
135
166
|
name: p.text,
|
136
167
|
url: p[:href],
|
data/lib/relaton_w3c/version.rb
CHANGED
@@ -5,35 +5,12 @@ module RelatonW3c
|
|
5
5
|
proposedRecommendation recommendation retired workingDraft
|
6
6
|
].freeze
|
7
7
|
|
8
|
-
attr_reader :doctype
|
9
|
-
|
10
8
|
# @param doctype [String]
|
11
9
|
def initialize(**args)
|
12
10
|
if args[:doctype] && !TYPES.include?(args[:doctype])
|
13
11
|
warn "[relaton-w3c] invalid document type: #{args[:doctype]}"
|
14
12
|
end
|
15
|
-
@doctype = args.delete :doctype
|
16
13
|
super **args
|
17
14
|
end
|
18
|
-
|
19
|
-
# @param builder [Nokogiri::XML::Builder, NilClass]
|
20
|
-
# @param opts [Hash]
|
21
|
-
# @option opts [TrueClass, FalseClass, NilClass] bibdata
|
22
|
-
def to_xml(builder = nil, **opts)
|
23
|
-
super builder, **opts do |b|
|
24
|
-
if opts[:bibdata] && doctype
|
25
|
-
b.ext do |e|
|
26
|
-
e.doctype doctype if doctype
|
27
|
-
end
|
28
|
-
end
|
29
|
-
end
|
30
|
-
end
|
31
|
-
|
32
|
-
# @return [Hash]
|
33
|
-
def to_hash
|
34
|
-
hash = super
|
35
|
-
hash["doctype"] = doctype if doctype
|
36
|
-
hash
|
37
|
-
end
|
38
15
|
end
|
39
16
|
end
|
@@ -1,19 +1,6 @@
|
|
1
1
|
module RelatonW3c
|
2
2
|
class XMLParser < RelatonBib::XMLParser
|
3
3
|
class << self
|
4
|
-
# @param xml [String]
|
5
|
-
# @return [RelatonW3c::W3cBibliographicItem, NilClass]
|
6
|
-
def from_xml(xml)
|
7
|
-
doc = Nokogiri::XML xml
|
8
|
-
doc.remove_namespaces!
|
9
|
-
item = doc.at("/bibitem|/bibdata")
|
10
|
-
if item
|
11
|
-
W3cBibliographicItem.new(item_data(item))
|
12
|
-
else
|
13
|
-
warn "[relaton-w3c] can't find bibitem or bibdata element in the XML"
|
14
|
-
end
|
15
|
-
end
|
16
|
-
|
17
4
|
private
|
18
5
|
|
19
6
|
# Override RelatonBib::XMLParser.item_data method.
|
@@ -27,6 +14,12 @@ module RelatonW3c
|
|
27
14
|
data[:doctype] = ext.at("./doctype")&.text
|
28
15
|
data
|
29
16
|
end
|
17
|
+
|
18
|
+
# @param item_hash [Hash]
|
19
|
+
# @return [RelatonBib::BibliographicItem]
|
20
|
+
def bib_item(item_hash)
|
21
|
+
W3cBibliographicItem.new item_hash
|
22
|
+
end
|
30
23
|
end
|
31
24
|
end
|
32
25
|
end
|
data/relaton_w3c.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: relaton-w3c
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.2
|
4
|
+
version: 1.3.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-09-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: debase
|
@@ -114,14 +114,14 @@ dependencies:
|
|
114
114
|
requirements:
|
115
115
|
- - ">="
|
116
116
|
- !ruby/object:Gem::Version
|
117
|
-
version: 1.0
|
117
|
+
version: 1.3.0
|
118
118
|
type: :runtime
|
119
119
|
prerelease: false
|
120
120
|
version_requirements: !ruby/object:Gem::Requirement
|
121
121
|
requirements:
|
122
122
|
- - ">="
|
123
123
|
- !ruby/object:Gem::Version
|
124
|
-
version: 1.0
|
124
|
+
version: 1.3.0
|
125
125
|
description: 'RelatonIso: retrieve W3C Standards for bibliographic using the IsoBibliographicItem
|
126
126
|
model'
|
127
127
|
email:
|
@@ -162,7 +162,7 @@ licenses:
|
|
162
162
|
- BSD-2-Clause
|
163
163
|
metadata:
|
164
164
|
homepage_uri: https://github.com/relaton/relaton-wc3
|
165
|
-
post_install_message:
|
165
|
+
post_install_message:
|
166
166
|
rdoc_options: []
|
167
167
|
require_paths:
|
168
168
|
- lib
|
@@ -178,7 +178,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
178
178
|
version: '0'
|
179
179
|
requirements: []
|
180
180
|
rubygems_version: 3.0.6
|
181
|
-
signing_key:
|
181
|
+
signing_key:
|
182
182
|
specification_version: 4
|
183
183
|
summary: 'RelatonIso: retrieve W3C Standards for bibliographic using the IsoBibliographicItem
|
184
184
|
model'
|