relaton-ecma 2.0.0.pre.alpha.2 → 2.0.0.pre.alpha.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.adoc +2 -2
- data/grammars/basicdoc.rng +14 -1
- data/grammars/biblio.rng +8 -8
- data/lib/relaton/ecma/bibliography.rb +2 -0
- data/lib/relaton/ecma/data_fetcher.rb +11 -1
- data/lib/relaton/ecma/data_parser.rb +25 -211
- data/lib/relaton/ecma/edition_parser.rb +80 -0
- data/lib/relaton/ecma/memento_parser.rb +60 -0
- data/lib/relaton/ecma/page_fetcher.rb +27 -0
- data/lib/relaton/ecma/parser_common.rb +33 -0
- data/lib/relaton/ecma/standard_parser.rb +134 -0
- data/lib/relaton/ecma/version.rb +1 -1
- data/relaton_ecma.gemspec +2 -2
- metadata +10 -5
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: ada2f96de510e80052f8d09ccb04ffa43718c7582eea853a10f1bcf30b39754f
|
|
4
|
+
data.tar.gz: 7400f8754bbaf45d9718c1855ea1a5b0363fe8c8080dbfef18177e6c4cddd5ae
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: a685d9c330a66bd2907eccd9fba5bdb0fa439994e65a931652c045fc5f0ad8183dd7a9ff7afca0f56ed80523562d18e15527c0f35a96617c19c41952a1a5fba2
|
|
7
|
+
data.tar.gz: b7b9a2fbdb69b84bc24f9af125733b8ee43c3631bdf1dde3407170d1051562dd2ee3de957d5fc07d6ca15a1b53c6548644c23032556ee5d6116be1c1b6a0a4f1
|
data/README.adoc
CHANGED
|
@@ -108,7 +108,7 @@ Relaton::Ecma::Bibliography.get '1111'
|
|
|
108
108
|
[source,ruby]
|
|
109
109
|
----
|
|
110
110
|
item.to_xml
|
|
111
|
-
=> "<bibitem id="ECMA6" type="standard" schema-version="v1.
|
|
111
|
+
=> "<bibitem id="ECMA6" type="standard" schema-version="v1.5.6">
|
|
112
112
|
<fetched>2025-12-29</fetched>
|
|
113
113
|
<title language="en" script="Latn">7-bit coded character set</title>
|
|
114
114
|
<uri type="src">https://ecma-international.org/publications-and-standards/standards/ecma-6/</uri>
|
|
@@ -122,7 +122,7 @@ With `bibdata: true` option XML output wrapped with `bibdata` element and `ext`
|
|
|
122
122
|
[source,ruby]
|
|
123
123
|
----
|
|
124
124
|
item.to_xml bibdata: true
|
|
125
|
-
=> "<bibdata type="standard" schema-version="v1.
|
|
125
|
+
=> "<bibdata type="standard" schema-version="v1.5.6">
|
|
126
126
|
<fetched>2025-12-29</fetched>
|
|
127
127
|
<title language="en" script="Latn">7-bit coded character set</title>
|
|
128
128
|
<uri type="src">https://ecma-international.org/publications-and-standards/standards/ecma-6/</uri>
|
data/grammars/basicdoc.rng
CHANGED
|
@@ -187,6 +187,15 @@ Applicable to modify and delete</a:documentation>
|
|
|
187
187
|
<a:documentation>Optional caption of this block</a:documentation>
|
|
188
188
|
</attribute>
|
|
189
189
|
</optional>
|
|
190
|
+
<optional>
|
|
191
|
+
<attribute name="position">
|
|
192
|
+
<a:documentation>For an "add" change, whether the change is added before or after the location</a:documentation>
|
|
193
|
+
<choice>
|
|
194
|
+
<value>before</value>
|
|
195
|
+
<value>after</value>
|
|
196
|
+
</choice>
|
|
197
|
+
</attribute>
|
|
198
|
+
</optional>
|
|
190
199
|
<optional>
|
|
191
200
|
<element name="location">
|
|
192
201
|
<a:documentation>The location(s) in the original document which have undergone the change described in this block</a:documentation>
|
|
@@ -208,11 +217,15 @@ Applicable to modify and delete</a:documentation>
|
|
|
208
217
|
</zeroOrMore>
|
|
209
218
|
<optional>
|
|
210
219
|
<element name="newcontent">
|
|
211
|
-
<a:documentation>New content to be added to the document; applicable to add and modify
|
|
220
|
+
<a:documentation>New content to be added to the document; applicable to add and modify.
|
|
221
|
+
Can be blocks and/or sections</a:documentation>
|
|
212
222
|
<ref name="OptionalId"/>
|
|
213
223
|
<zeroOrMore>
|
|
214
224
|
<ref name="BasicBlock"/>
|
|
215
225
|
</zeroOrMore>
|
|
226
|
+
<zeroOrMore>
|
|
227
|
+
<ref name="section"/>
|
|
228
|
+
</zeroOrMore>
|
|
216
229
|
</element>
|
|
217
230
|
</optional>
|
|
218
231
|
<zeroOrMore>
|
data/grammars/biblio.rng
CHANGED
|
@@ -1142,11 +1142,11 @@ NOTE: This should preferably be encoded as a URI or short identifier, rather th
|
|
|
1142
1142
|
<a:documentation>Information about how long the current description of the bibliographic item is valid for</a:documentation>
|
|
1143
1143
|
</ref>
|
|
1144
1144
|
</optional>
|
|
1145
|
-
<
|
|
1145
|
+
<zeroOrMore>
|
|
1146
1146
|
<ref name="depiction">
|
|
1147
1147
|
<a:documentation>Depiction of the bibliographic item, typically an image</a:documentation>
|
|
1148
1148
|
</ref>
|
|
1149
|
-
</
|
|
1149
|
+
</zeroOrMore>
|
|
1150
1150
|
</define>
|
|
1151
1151
|
<define name="ReducedBibliographicItem">
|
|
1152
1152
|
<a:documentation>Reduced description of a bibliographic resource, without mandatory title and docidentifier, used for document relations
|
|
@@ -1939,10 +1939,10 @@ Detailed in https://www.relaton.org/model/relations/</a:documentation>
|
|
|
1939
1939
|
<value>hasAnnotation</value>
|
|
1940
1940
|
<value>draftOf</value>
|
|
1941
1941
|
<value>hasDraft</value>
|
|
1942
|
-
<value>
|
|
1943
|
-
<value>
|
|
1944
|
-
<value>
|
|
1945
|
-
<value>
|
|
1942
|
+
<value>predecessorDraftOf</value>
|
|
1943
|
+
<value>hasPredecessorDraft</value>
|
|
1944
|
+
<value>successorDraftOf</value>
|
|
1945
|
+
<value>hasSuccessorDraft</value>
|
|
1946
1946
|
<value>editionOf</value>
|
|
1947
1947
|
<value>hasEdition</value>
|
|
1948
1948
|
<value>updates</value>
|
|
@@ -2063,13 +2063,13 @@ provided that it is not the entire bibliographic item that is so related</a:docu
|
|
|
2063
2063
|
<ref name="LocalizedString"/>
|
|
2064
2064
|
</element>
|
|
2065
2065
|
</optional>
|
|
2066
|
-
<
|
|
2066
|
+
<zeroOrMore>
|
|
2067
2067
|
<element name="taxon">
|
|
2068
2068
|
<a:documentation>The keywords as a hierarchical taxonomy. For example, the sequence of `taxon` elements
|
|
2069
2069
|
`pump`, `centrifugal pump`, `line shaft pump` represents a taxonomic classification</a:documentation>
|
|
2070
2070
|
<ref name="LocalizedString"/>
|
|
2071
2071
|
</element>
|
|
2072
|
-
</
|
|
2072
|
+
</zeroOrMore>
|
|
2073
2073
|
<zeroOrMore>
|
|
2074
2074
|
<ref name="vocabid">
|
|
2075
2075
|
<a:documentation>Identifiers for the keyword as a controlled vocabulary</a:documentation>
|
|
@@ -4,6 +4,11 @@ require "English"
|
|
|
4
4
|
require "mechanize"
|
|
5
5
|
require "relaton/core"
|
|
6
6
|
require_relative "../ecma"
|
|
7
|
+
require_relative "parser_common"
|
|
8
|
+
require_relative "page_fetcher"
|
|
9
|
+
require_relative "standard_parser"
|
|
10
|
+
require_relative "memento_parser"
|
|
11
|
+
require_relative "edition_parser"
|
|
7
12
|
require_relative "data_parser"
|
|
8
13
|
|
|
9
14
|
module Relaton
|
|
@@ -16,6 +21,10 @@ module Relaton
|
|
|
16
21
|
@index ||= Relaton::Index.find_or_create :ecma, file: "#{INDEXFILE}.yaml"
|
|
17
22
|
end
|
|
18
23
|
|
|
24
|
+
def log_error(msg)
|
|
25
|
+
Util.error msg
|
|
26
|
+
end
|
|
27
|
+
|
|
19
28
|
def agent
|
|
20
29
|
@agent ||= Mechanize.new.tap { |a| a.user_agent_alias = Mechanize::AGENT_ALIASES.keys.sample }
|
|
21
30
|
end
|
|
@@ -62,7 +71,7 @@ module Relaton
|
|
|
62
71
|
|
|
63
72
|
# @param hit [Nokogiri::XML::Element]
|
|
64
73
|
def parse_page(hit) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
|
65
|
-
DataParser.new(hit).parse.each { |item| write_file item }
|
|
74
|
+
DataParser.new(hit, @errors).parse.each { |item| write_file item }
|
|
66
75
|
end
|
|
67
76
|
|
|
68
77
|
# @param type [String]
|
|
@@ -87,6 +96,7 @@ module Relaton
|
|
|
87
96
|
def fetch(_ = nil)
|
|
88
97
|
SOURCES.each { |source| html_index source }
|
|
89
98
|
index.save
|
|
99
|
+
report_errors
|
|
90
100
|
end
|
|
91
101
|
end
|
|
92
102
|
end
|
|
@@ -1,235 +1,49 @@
|
|
|
1
1
|
module Relaton
|
|
2
2
|
module Ecma
|
|
3
3
|
class DataParser
|
|
4
|
-
|
|
5
|
-
ATTRS = MATTRS + %i[abstract relation edition ext].freeze
|
|
4
|
+
include ParserCommon
|
|
6
5
|
|
|
7
6
|
#
|
|
8
7
|
# Initialize parser
|
|
9
8
|
#
|
|
10
9
|
# @param [Nokogiri::XML::Element] hit document hit
|
|
10
|
+
# @param [Hash] errors error tracking hash
|
|
11
11
|
#
|
|
12
|
-
def initialize(hit)
|
|
12
|
+
def initialize(hit, errors = {})
|
|
13
13
|
@hit = hit
|
|
14
|
-
@
|
|
15
|
-
type: "standard", language: ["en"], script: ["Latn"], place: [Bib::Place.new(city: "Geneva")]
|
|
16
|
-
}
|
|
17
|
-
@agent = Mechanize.new
|
|
14
|
+
@errors = errors
|
|
18
15
|
end
|
|
19
16
|
|
|
20
|
-
|
|
17
|
+
# @return [Array<Relaton::Ecma::ItemData>]
|
|
18
|
+
def parse
|
|
21
19
|
if @hit[:href]
|
|
22
|
-
|
|
23
|
-
@doc = get_page @hit[:href]
|
|
24
|
-
ATTRS.each { |a| @bib[a] = send "fetch_#{a}" }
|
|
20
|
+
parse_standard
|
|
25
21
|
else
|
|
26
|
-
|
|
22
|
+
parse_memento
|
|
27
23
|
end
|
|
28
|
-
@bib[:contributor] = contributor
|
|
29
|
-
items = [ItemData.new(**@bib)]
|
|
30
|
-
items + parse_editions
|
|
31
24
|
end
|
|
32
25
|
|
|
33
|
-
|
|
34
|
-
# Get page with retries
|
|
35
|
-
#
|
|
36
|
-
# @param [String] url url to fetch
|
|
37
|
-
#
|
|
38
|
-
# @return [Mechanize::Page] document
|
|
39
|
-
#
|
|
40
|
-
def get_page(url)
|
|
41
|
-
3.times do |n|
|
|
42
|
-
sleep n
|
|
43
|
-
doc = @agent.get url
|
|
44
|
-
return doc
|
|
45
|
-
rescue StandardError => e
|
|
46
|
-
Util.error e.message
|
|
47
|
-
end
|
|
48
|
-
end
|
|
49
|
-
|
|
50
|
-
#
|
|
51
|
-
# Parse editions
|
|
52
|
-
#
|
|
53
|
-
# @param [Mechanize::Page] doc document
|
|
54
|
-
# @param [Hash] bib bibliographic item the last edition
|
|
55
|
-
#
|
|
56
|
-
# @return [Array<Relaton::Ecma::ItemData>] editions
|
|
57
|
-
#
|
|
58
|
-
def parse_editions # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
|
|
59
|
-
return [] unless @doc
|
|
60
|
-
|
|
61
|
-
docid = @bib[:docid]
|
|
62
|
-
@doc.xpath('//div[@id="main"]/div[1]/div/main/article/div/div/standard/div/ul/li').map do |hit|
|
|
63
|
-
id, ed, @bib[:date], vol = edition_id_parts hit.at("./span", "./a").text
|
|
64
|
-
@bib[:source] = edition_source(hit) + edition_translation_source(ed)
|
|
65
|
-
next if ed.nil? || ed.empty?
|
|
66
|
-
|
|
67
|
-
@bib[:docidentifier] = id.nil? || id.empty? ? docid : fetch_docidentifier(id)
|
|
68
|
-
@bib[:edition] = Bib::Edition.new(content: ed)
|
|
69
|
-
@bib[:extent] = create_extent(vol)
|
|
70
|
-
ItemData.new(**@bib)
|
|
71
|
-
end.compact
|
|
72
|
-
end
|
|
73
|
-
|
|
74
|
-
def create_extent(vol)
|
|
75
|
-
return unless vol && !vol.empty?
|
|
76
|
-
|
|
77
|
-
locality = Bib::Locality.new(type: "volume", reference_from: vol)
|
|
78
|
-
[Bib::Extent.new(locality: [locality])]
|
|
79
|
-
end
|
|
80
|
-
|
|
81
|
-
def edition_source(hit)
|
|
82
|
-
{ "src" => hit.at("./a"), "pdf" => hit.at("./span/a") }.map do |type, a|
|
|
83
|
-
Bib::Uri.new(type: type, content: a[:href]) if a
|
|
84
|
-
end.compact
|
|
85
|
-
end
|
|
86
|
-
|
|
87
|
-
#
|
|
88
|
-
# Parse edition and date
|
|
89
|
-
#
|
|
90
|
-
# @param [String] text identifier text
|
|
91
|
-
#
|
|
92
|
-
# @return [Array<String,nil,Array<Relaton::Bib::Date>>] edition and date
|
|
93
|
-
#
|
|
94
|
-
def edition_id_parts(text) # rubocop:disable Metrics/MethodLength
|
|
95
|
-
%r{^
|
|
96
|
-
(?<id>\w+(?:[\d-]+|\sTR/\d+)),?\s
|
|
97
|
-
(?:Volume\s(?<vol>[\d.]+),?\s)?
|
|
98
|
-
(?<ed>[\d.]+)(?:st|nd|rd|th)?\sedition
|
|
99
|
-
(?:[,.]\s(?<dt>\w+\s\d+))?
|
|
100
|
-
}x =~ text
|
|
101
|
-
date = [dt].compact.map do |d|
|
|
102
|
-
on = Date.strptime(d, "%B %Y").strftime("%Y-%m")
|
|
103
|
-
Bib::Date.new(type: "published", at: on)
|
|
104
|
-
end
|
|
105
|
-
[id, ed, date, vol]
|
|
106
|
-
end
|
|
107
|
-
|
|
108
|
-
# @return [Array<Relaton::Bib::Docidentifier>]
|
|
109
|
-
def fetch_docidentifier(id = nil)
|
|
110
|
-
id ||= @hit.text
|
|
111
|
-
[Bib::Docidentifier.new(type: "ECMA", content: id, primary: true)]
|
|
112
|
-
end
|
|
113
|
-
|
|
114
|
-
# @return [Array<Relaton::Bib::Uri>]
|
|
115
|
-
def fetch_source # rubocop:disable Metrics/AbcSize
|
|
116
|
-
source = []
|
|
117
|
-
source << Bib::Uri.new(type: "src", content: @hit[:href]) if @hit[:href]
|
|
118
|
-
ref = @doc.at('//div[@class="ecma-item-content-wrapper"]/span/a',
|
|
119
|
-
'//div[@class="ecma-item-content-wrapper"]/a')
|
|
120
|
-
source << Bib::Uri.new(type: "pdf", content: ref[:href]) if ref
|
|
121
|
-
source + edition_translation_source(@bib[:edition]&.content)
|
|
122
|
-
end
|
|
123
|
-
|
|
124
|
-
def fetch_mem_source
|
|
125
|
-
@hit.xpath("./div/section/div/p/a").map do |a|
|
|
126
|
-
Bib::Uri.new(type: "pdf", content: a[:href])
|
|
127
|
-
end
|
|
128
|
-
end
|
|
129
|
-
|
|
130
|
-
def edition_translation_source(edition)
|
|
131
|
-
translation_source.select { |s| s[:ed] == edition }.map { |s| s[:source] }
|
|
132
|
-
end
|
|
133
|
-
|
|
134
|
-
def translation_source
|
|
135
|
-
return [] unless @doc
|
|
136
|
-
|
|
137
|
-
@translation_source ||= @doc.xpath("//h2[.='Translations']/following-sibling::ul/li").map do |l|
|
|
138
|
-
a = l.at("span/a")
|
|
139
|
-
id = l.at("span").text
|
|
140
|
-
%r{\w+[\d-]+,\s(?<lang>\w+)\sversion,\s(?<ed>[\d.]+)(?:st|nd|rd|th)\sedition} =~ id
|
|
141
|
-
case lang
|
|
142
|
-
when "Japanese"
|
|
143
|
-
{ ed: ed, source: Bib::Uri.new(type: "pdf", language: "ja", script: "Jpan", content: a[:href]) }
|
|
144
|
-
end
|
|
145
|
-
end.compact
|
|
146
|
-
end
|
|
147
|
-
|
|
148
|
-
# @return [Array<Relaton::Bib::Title>]
|
|
149
|
-
def fetch_title
|
|
150
|
-
@doc.xpath('//p[@class="ecma-item-short-description"]').map do |t|
|
|
151
|
-
Bib::Title.new(content: t.text.strip, language: "en", script: "Latn")
|
|
152
|
-
end
|
|
153
|
-
end
|
|
154
|
-
|
|
155
|
-
# @return [Array<Relaton::Bib::LocalizedMarkedUpString>]
|
|
156
|
-
def fetch_abstract
|
|
157
|
-
content = @doc.xpath('//div[@class="ecma-item-content"]/p').map do |a|
|
|
158
|
-
a.text.strip.squeeze(" ").gsub("\r\n", "")
|
|
159
|
-
end.join "\n"
|
|
160
|
-
return [] if content.empty?
|
|
161
|
-
|
|
162
|
-
[Bib::LocalizedMarkedUpString.new(content: content, language: "en", script: "Latn")]
|
|
163
|
-
end
|
|
164
|
-
|
|
165
|
-
# @return [Array<Relaton::Bib::Date>]
|
|
166
|
-
def fetch_date
|
|
167
|
-
@doc.xpath('//p[@class="ecma-item-edition"]').map do |d|
|
|
168
|
-
date = d.text.split(", ").last
|
|
169
|
-
Bib::Date.new type: "published", at: date
|
|
170
|
-
end
|
|
171
|
-
end
|
|
172
|
-
|
|
173
|
-
# @return [Array<Relaton::Bib::Relation>]
|
|
174
|
-
def fetch_relation # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity
|
|
175
|
-
@doc.xpath("//ul[@class='ecma-item-archives']/li").filter_map do |rel|
|
|
176
|
-
ref, ed, date, vol = edition_id_parts rel.at("span").text
|
|
177
|
-
next if ed.nil? || ed.empty?
|
|
26
|
+
private
|
|
178
27
|
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
#
|
|
192
|
-
# @return [Relaton::Bib::Edition, nil]
|
|
193
|
-
#
|
|
194
|
-
def fetch_edition
|
|
195
|
-
cnt = @doc.at('//p[@class="ecma-item-edition"]')&.text&.match(/^\d+(?=(?:st|nd|th|rd))/)&.to_s
|
|
196
|
-
Bib::Edition.new(content: cnt) if cnt && !cnt.empty?
|
|
197
|
-
end
|
|
198
|
-
|
|
199
|
-
def contributor
|
|
200
|
-
orgname = Bib::TypedLocalizedString.new(content: "Ecma International", language: "en", script: "Latn")
|
|
201
|
-
org = Bib::Organization.new name: [orgname]
|
|
202
|
-
role = Bib::Contributor::Role.new type: "publisher"
|
|
203
|
-
[Bib::Contributor.new(organization: org, role: [role])]
|
|
204
|
-
end
|
|
205
|
-
|
|
206
|
-
# @return [Array<Relaton::Bib::Docidentifier>]
|
|
207
|
-
def fetch_mem_docidentifier
|
|
208
|
-
code = "ECMA MEM/#{@hit.at('div[1]//p').text}"
|
|
209
|
-
fetch_docidentifier code
|
|
210
|
-
end
|
|
211
|
-
|
|
212
|
-
def fetch_mem_date
|
|
213
|
-
date = @hit.at("div[2]//p").text
|
|
214
|
-
on = Date.strptime(date, "%B %Y").strftime "%Y-%m"
|
|
215
|
-
[Bib::Date.new(type: "published", at: on)]
|
|
216
|
-
end
|
|
217
|
-
|
|
218
|
-
def fetch_mem_title
|
|
219
|
-
year = @hit.at("div[1]//p").text
|
|
220
|
-
content = "\"Memento #{year}\" for year #{year}"
|
|
221
|
-
[Bib::Title.new(content: content, language: "en", script: "Latn")]
|
|
222
|
-
end
|
|
223
|
-
|
|
224
|
-
def fetch_ext
|
|
225
|
-
Ext.new(doctype: fetch_doctype, flavor: "ecma")
|
|
28
|
+
def parse_standard
|
|
29
|
+
doc = PageFetcher.new.get(@hit[:href])
|
|
30
|
+
parser = StandardParser.new(hit: @hit, doc: doc, errors: @errors)
|
|
31
|
+
bib = parser.to_bib_hash
|
|
32
|
+
bib[:contributor] = contributor
|
|
33
|
+
items = [ItemData.new(**bib)]
|
|
34
|
+
edition_parser = EditionParser.new(
|
|
35
|
+
doc: doc, bib: bib, errors: @errors,
|
|
36
|
+
translation_source: parser.translation_source
|
|
37
|
+
)
|
|
38
|
+
items + edition_parser.parse
|
|
226
39
|
end
|
|
227
40
|
|
|
228
|
-
def
|
|
229
|
-
|
|
41
|
+
def parse_memento
|
|
42
|
+
parser = MementoParser.new(hit: @hit, errors: @errors)
|
|
43
|
+
bib = parser.to_bib_hash
|
|
44
|
+
bib[:contributor] = contributor
|
|
45
|
+
[ItemData.new(**bib)]
|
|
230
46
|
end
|
|
231
|
-
|
|
232
|
-
alias_method :fetch_mem_ext, :fetch_ext
|
|
233
47
|
end
|
|
234
48
|
end
|
|
235
49
|
end
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
module Relaton
|
|
2
|
+
module Ecma
|
|
3
|
+
class EditionParser
|
|
4
|
+
include ParserCommon
|
|
5
|
+
|
|
6
|
+
# @param [Mechanize::Page] doc document page
|
|
7
|
+
# @param [Hash] bib base bibliographic item attributes
|
|
8
|
+
# @param [Hash] errors error tracking hash
|
|
9
|
+
# @param [Array] translation_source precomputed translation sources
|
|
10
|
+
def initialize(doc:, bib:, errors: {}, translation_source: [])
|
|
11
|
+
@doc = doc
|
|
12
|
+
@bib = bib
|
|
13
|
+
@errors = errors
|
|
14
|
+
@translation_source = translation_source
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
# @return [Array<Relaton::Ecma::ItemData>] editions
|
|
18
|
+
def parse # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
|
|
19
|
+
return [] unless @doc
|
|
20
|
+
|
|
21
|
+
docid = @bib[:docidentifier]
|
|
22
|
+
@doc.xpath('//div[@id="main"]/div[1]/div/main/article/div/div/standard/div/ul/li').map do |hit|
|
|
23
|
+
bib = @bib.dup
|
|
24
|
+
id, ed, bib[:date], vol = edition_id_parts hit.at("./span", "./a").text
|
|
25
|
+
bib[:source] = edition_source(hit) + edition_translation_source(ed)
|
|
26
|
+
next if ed.nil? || ed.empty?
|
|
27
|
+
|
|
28
|
+
bib[:docidentifier] = id.nil? || id.empty? ? docid : fetch_docidentifier(id)
|
|
29
|
+
@errors[:edition_docidentifier] &&= bib[:docidentifier].empty?
|
|
30
|
+
bib[:edition] = Bib::Edition.new(content: ed)
|
|
31
|
+
bib[:extent] = create_extent(vol)
|
|
32
|
+
@errors[:edition_extent] &&= bib[:extent].nil?
|
|
33
|
+
ItemData.new(**bib)
|
|
34
|
+
end.compact
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
#
|
|
38
|
+
# Parse edition and date
|
|
39
|
+
#
|
|
40
|
+
# @param [String] text identifier text
|
|
41
|
+
#
|
|
42
|
+
# @return [Array<String,nil,Array<Relaton::Bib::Date>>] edition and date
|
|
43
|
+
#
|
|
44
|
+
def edition_id_parts(text) # rubocop:disable Metrics/MethodLength
|
|
45
|
+
%r{^
|
|
46
|
+
(?<id>\w+(?:[\d-]+|\sTR/\d+)),?\s
|
|
47
|
+
(?:Volume\s(?<vol>[\d.]+),?\s)?
|
|
48
|
+
(?<ed>[\d.]+)(?:st|nd|rd|th)?\sedition
|
|
49
|
+
(?:[,.]\s(?<dt>\w+\s\d+))?
|
|
50
|
+
}x =~ text
|
|
51
|
+
date = [dt].compact.map do |d|
|
|
52
|
+
on = Date.strptime(d, "%B %Y").strftime("%Y-%m")
|
|
53
|
+
Bib::Date.new(type: "published", at: on)
|
|
54
|
+
end
|
|
55
|
+
[id, ed, date, vol]
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
def edition_source(hit)
|
|
59
|
+
es = { "src" => hit.at("./a"), "pdf" => hit.at("./span/a") }.map do |type, a|
|
|
60
|
+
Bib::Uri.new(type: type, content: a[:href]) if a
|
|
61
|
+
end.compact
|
|
62
|
+
@errors[:edition_source] &&= es.empty?
|
|
63
|
+
es
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
def create_extent(vol)
|
|
67
|
+
return unless vol && !vol.empty?
|
|
68
|
+
|
|
69
|
+
locality = Bib::Locality.new(type: "volume", reference_from: vol)
|
|
70
|
+
[Bib::Extent.new(locality: [locality])]
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
private
|
|
74
|
+
|
|
75
|
+
def edition_translation_source(edition)
|
|
76
|
+
@translation_source.select { |s| s[:ed] == edition }.map { |s| s[:source] }
|
|
77
|
+
end
|
|
78
|
+
end
|
|
79
|
+
end
|
|
80
|
+
end
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
module Relaton
|
|
2
|
+
module Ecma
|
|
3
|
+
class MementoParser
|
|
4
|
+
include ParserCommon
|
|
5
|
+
|
|
6
|
+
ATTRS = %i[docidentifier title date source ext].freeze
|
|
7
|
+
|
|
8
|
+
# @param [Nokogiri::XML::Element] hit document hit
|
|
9
|
+
# @param [Hash] errors error tracking hash
|
|
10
|
+
def initialize(hit:, errors: {})
|
|
11
|
+
@hit = hit
|
|
12
|
+
@errors = errors
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
# @return [Hash] bibliographic item attributes
|
|
16
|
+
def to_bib_hash
|
|
17
|
+
bib = default_bib_hash
|
|
18
|
+
ATTRS.each { |a| bib[a] = send "fetch_#{a}" }
|
|
19
|
+
bib
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
private
|
|
23
|
+
|
|
24
|
+
# @return [Array<Relaton::Bib::Docidentifier>]
|
|
25
|
+
def fetch_docidentifier
|
|
26
|
+
code = "ECMA MEM/#{@hit.at('div[1]//p').text}"
|
|
27
|
+
docid = super(code)
|
|
28
|
+
@errors[:memento_docidentifier] &&= docid.empty?
|
|
29
|
+
docid
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
# @return [Array<Relaton::Bib::Title>]
|
|
33
|
+
def fetch_title
|
|
34
|
+
year = @hit.at("div[1]//p").text
|
|
35
|
+
content = "\"Memento #{year}\" for year #{year}"
|
|
36
|
+
result = [Bib::Title.new(content: content, language: "en", script: "Latn")]
|
|
37
|
+
@errors[:memento_title] &&= result.empty?
|
|
38
|
+
result
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
# @return [Array<Relaton::Bib::Date>]
|
|
42
|
+
def fetch_date
|
|
43
|
+
date = @hit.at("div[2]//p").text
|
|
44
|
+
on = Date.strptime(date, "%B %Y").strftime "%Y-%m"
|
|
45
|
+
result = [Bib::Date.new(type: "published", at: on)]
|
|
46
|
+
@errors[:memento_date] &&= result.empty?
|
|
47
|
+
result
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
# @return [Array<Relaton::Bib::Uri>]
|
|
51
|
+
def fetch_source
|
|
52
|
+
result = @hit.xpath("./div/section/div/p/a").map do |a|
|
|
53
|
+
Bib::Uri.new(type: "pdf", content: a[:href])
|
|
54
|
+
end
|
|
55
|
+
@errors[:memento_source] &&= result.empty?
|
|
56
|
+
result
|
|
57
|
+
end
|
|
58
|
+
end
|
|
59
|
+
end
|
|
60
|
+
end
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
module Relaton
|
|
2
|
+
module Ecma
|
|
3
|
+
class PageFetcher
|
|
4
|
+
def initialize
|
|
5
|
+
@agent = Mechanize.new
|
|
6
|
+
@agent.user_agent_alias = Mechanize::AGENT_ALIASES.keys[rand(21)]
|
|
7
|
+
end
|
|
8
|
+
|
|
9
|
+
#
|
|
10
|
+
# Get page with retries
|
|
11
|
+
#
|
|
12
|
+
# @param [String] url url to fetch
|
|
13
|
+
#
|
|
14
|
+
# @return [Mechanize::Page] document
|
|
15
|
+
#
|
|
16
|
+
def get(url)
|
|
17
|
+
3.times do |n|
|
|
18
|
+
sleep n
|
|
19
|
+
doc = @agent.get url
|
|
20
|
+
return doc
|
|
21
|
+
rescue StandardError => e
|
|
22
|
+
Util.error e.message
|
|
23
|
+
end
|
|
24
|
+
end
|
|
25
|
+
end
|
|
26
|
+
end
|
|
27
|
+
end
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
module Relaton
|
|
2
|
+
module Ecma
|
|
3
|
+
module ParserCommon
|
|
4
|
+
def default_bib_hash
|
|
5
|
+
{
|
|
6
|
+
type: "standard", language: ["en"], script: ["Latn"], place: [Bib::Place.new(city: "Geneva")]
|
|
7
|
+
}
|
|
8
|
+
end
|
|
9
|
+
|
|
10
|
+
def contributor
|
|
11
|
+
orgname = Bib::TypedLocalizedString.new(content: "Ecma International", language: "en", script: "Latn")
|
|
12
|
+
org = Bib::Organization.new name: [orgname]
|
|
13
|
+
role = Bib::Contributor::Role.new type: "publisher"
|
|
14
|
+
[Bib::Contributor.new(organization: org, role: [role])]
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
# @return [Array<Relaton::Bib::Docidentifier>]
|
|
18
|
+
def fetch_docidentifier(id = nil)
|
|
19
|
+
return [] if id.nil? || id.empty?
|
|
20
|
+
|
|
21
|
+
[Bib::Docidentifier.new(type: "ECMA", content: id, primary: true)]
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
def fetch_ext
|
|
25
|
+
Ext.new(doctype: fetch_doctype, flavor: "ecma")
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
def fetch_doctype
|
|
29
|
+
Bib::Doctype.new content: "document"
|
|
30
|
+
end
|
|
31
|
+
end
|
|
32
|
+
end
|
|
33
|
+
end
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
module Relaton
|
|
2
|
+
module Ecma
|
|
3
|
+
class StandardParser
|
|
4
|
+
include ParserCommon
|
|
5
|
+
|
|
6
|
+
ATTRS = %i[docidentifier title date source abstract relation edition ext].freeze
|
|
7
|
+
|
|
8
|
+
# @param [Nokogiri::XML::Element] hit document hit
|
|
9
|
+
# @param [Mechanize::Page] doc fetched document page
|
|
10
|
+
# @param [Hash] errors error tracking hash
|
|
11
|
+
def initialize(hit:, doc:, errors: {})
|
|
12
|
+
@hit = hit
|
|
13
|
+
@doc = doc
|
|
14
|
+
@errors = errors
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
# @return [Hash] bibliographic item attributes
|
|
18
|
+
def to_bib_hash
|
|
19
|
+
bib = default_bib_hash
|
|
20
|
+
ATTRS.each { |a| bib[a] = send "fetch_#{a}" }
|
|
21
|
+
bib
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
# @return [Array] precomputed translation sources
|
|
25
|
+
def translation_source
|
|
26
|
+
@translation_source ||= parse_translation_source
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
# @return [Array<Relaton::Bib::Docidentifier>]
|
|
30
|
+
def fetch_docidentifier
|
|
31
|
+
result = super(@hit.text)
|
|
32
|
+
@errors[:standard_docidentifier] &&= result.empty?
|
|
33
|
+
result
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
# @return [Array<Relaton::Bib::Title>]
|
|
37
|
+
def fetch_title
|
|
38
|
+
result = @doc.xpath('//p[@class="ecma-item-short-description"]').map do |t|
|
|
39
|
+
Bib::Title.new(content: t.text.strip, language: "en", script: "Latn")
|
|
40
|
+
end
|
|
41
|
+
@errors[:standard_title] &&= result.empty?
|
|
42
|
+
result
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
# @return [Array<Relaton::Bib::LocalizedMarkedUpString>]
|
|
46
|
+
def fetch_abstract
|
|
47
|
+
content = @doc.xpath('//div[@class="ecma-item-content"]/p').map do |a|
|
|
48
|
+
a.text.strip.squeeze(" ").gsub("\r\n", "")
|
|
49
|
+
end.join "\n"
|
|
50
|
+
return [] if content.empty?
|
|
51
|
+
|
|
52
|
+
result = [Bib::Abstract.new(content: content, language: "en", script: "Latn")]
|
|
53
|
+
@errors[:standard_abstract] &&= result.empty?
|
|
54
|
+
result
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
# @return [Array<Relaton::Bib::Date>]
|
|
58
|
+
def fetch_date
|
|
59
|
+
result = @doc.xpath('//p[@class="ecma-item-edition"]').map do |d|
|
|
60
|
+
date = d.text.split(", ").last
|
|
61
|
+
Bib::Date.new type: "published", at: date
|
|
62
|
+
end
|
|
63
|
+
@errors[:standard_date] &&= result.empty?
|
|
64
|
+
result
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
# @return [Array<Relaton::Bib::Uri>]
|
|
68
|
+
def fetch_source # rubocop:disable Metrics/AbcSize
|
|
69
|
+
source = []
|
|
70
|
+
source << Bib::Uri.new(type: "src", content: @hit[:href]) if @hit[:href]
|
|
71
|
+
ref = @doc.at('//div[@class="ecma-item-content-wrapper"]/span/a',
|
|
72
|
+
'//div[@class="ecma-item-content-wrapper"]/a')
|
|
73
|
+
source << Bib::Uri.new(type: "pdf", content: ref[:href]) if ref
|
|
74
|
+
result = source + edition_translation_source(fetch_edition_content)
|
|
75
|
+
@errors[:standard_source] &&= result.empty?
|
|
76
|
+
result
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
# @return [Array<Relaton::Bib::Relation>]
|
|
80
|
+
def fetch_relation # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity
|
|
81
|
+
edition_parser = EditionParser.new(doc: @doc, bib: {}, errors: @errors)
|
|
82
|
+
result = @doc.xpath("//ul[@class='ecma-item-archives']/li").filter_map do |rel|
|
|
83
|
+
ref, ed, date, vol = edition_parser.edition_id_parts rel.at("span").text
|
|
84
|
+
next if ed.nil? || ed.empty?
|
|
85
|
+
|
|
86
|
+
docid = Bib::Docidentifier.new(type: "ECMA", content: ref, primary: true)
|
|
87
|
+
source = rel.xpath("span/a").map { |l| Bib::Uri.new type: "pdf", content: l[:href] }
|
|
88
|
+
edition = Bib::Edition.new content: ed
|
|
89
|
+
extent = edition_parser.create_extent(vol)
|
|
90
|
+
@errors[:standard_relation_extent] &&= extent.nil?
|
|
91
|
+
bibitem = ItemData.new(
|
|
92
|
+
docidentifier: [docid], formattedref: Bib::Formattedref.new(content: ref), date: date, edition: edition,
|
|
93
|
+
source: source, extent: extent
|
|
94
|
+
)
|
|
95
|
+
Bib::Relation.new(type: "updates", bibitem: bibitem)
|
|
96
|
+
end
|
|
97
|
+
@errors[:standard_relation] &&= result.empty?
|
|
98
|
+
result
|
|
99
|
+
end
|
|
100
|
+
|
|
101
|
+
# @return [Relaton::Bib::Edition, nil]
|
|
102
|
+
def fetch_edition
|
|
103
|
+
cnt = fetch_edition_content
|
|
104
|
+
result = Bib::Edition.new(content: cnt) if cnt && !cnt.empty?
|
|
105
|
+
@errors[:standard_edition] &&= result.nil?
|
|
106
|
+
result
|
|
107
|
+
end
|
|
108
|
+
|
|
109
|
+
private
|
|
110
|
+
|
|
111
|
+
def fetch_edition_content
|
|
112
|
+
@doc.at('//p[@class="ecma-item-edition"]')&.text&.match(/^\d+(?=(?:st|nd|th|rd))/)&.to_s
|
|
113
|
+
end
|
|
114
|
+
|
|
115
|
+
def edition_translation_source(edition)
|
|
116
|
+
translation_source.select { |s| s[:ed] == edition }.map { |s| s[:source] }
|
|
117
|
+
end
|
|
118
|
+
|
|
119
|
+
def parse_translation_source
|
|
120
|
+
return [] unless @doc
|
|
121
|
+
|
|
122
|
+
@doc.xpath("//h2[.='Translations']/following-sibling::ul/li").map do |l|
|
|
123
|
+
a = l.at("span/a")
|
|
124
|
+
id = l.at("span").text
|
|
125
|
+
%r{\w+[\d-]+,\s(?<lang>\w+)\sversion,\s(?<ed>[\d.]+)(?:st|nd|rd|th)\sedition} =~ id
|
|
126
|
+
case lang
|
|
127
|
+
when "Japanese"
|
|
128
|
+
{ ed: ed, source: Bib::Uri.new(type: "pdf", language: "ja", script: "Jpan", content: a[:href]) }
|
|
129
|
+
end
|
|
130
|
+
end.compact
|
|
131
|
+
end
|
|
132
|
+
end
|
|
133
|
+
end
|
|
134
|
+
end
|
data/lib/relaton/ecma/version.rb
CHANGED
data/relaton_ecma.gemspec
CHANGED
|
@@ -28,7 +28,7 @@ Gem::Specification.new do |spec| # rubocop:disable Metrics/BlockLength
|
|
|
28
28
|
spec.require_paths = ["lib"]
|
|
29
29
|
|
|
30
30
|
spec.add_dependency "mechanize", "~> 2.10"
|
|
31
|
-
spec.add_dependency "relaton-bib", "~> 2.0.0-alpha.
|
|
32
|
-
spec.add_dependency "relaton-core", "~> 0.0.
|
|
31
|
+
spec.add_dependency "relaton-bib", "~> 2.0.0-alpha.7"
|
|
32
|
+
spec.add_dependency "relaton-core", "~> 0.0.12"
|
|
33
33
|
spec.add_dependency "relaton-index", "~> 0.2.4"
|
|
34
34
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: relaton-ecma
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.0.0.pre.alpha.
|
|
4
|
+
version: 2.0.0.pre.alpha.3
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ribose Inc.
|
|
@@ -29,28 +29,28 @@ dependencies:
|
|
|
29
29
|
requirements:
|
|
30
30
|
- - "~>"
|
|
31
31
|
- !ruby/object:Gem::Version
|
|
32
|
-
version: 2.0.0.pre.alpha.
|
|
32
|
+
version: 2.0.0.pre.alpha.7
|
|
33
33
|
type: :runtime
|
|
34
34
|
prerelease: false
|
|
35
35
|
version_requirements: !ruby/object:Gem::Requirement
|
|
36
36
|
requirements:
|
|
37
37
|
- - "~>"
|
|
38
38
|
- !ruby/object:Gem::Version
|
|
39
|
-
version: 2.0.0.pre.alpha.
|
|
39
|
+
version: 2.0.0.pre.alpha.7
|
|
40
40
|
- !ruby/object:Gem::Dependency
|
|
41
41
|
name: relaton-core
|
|
42
42
|
requirement: !ruby/object:Gem::Requirement
|
|
43
43
|
requirements:
|
|
44
44
|
- - "~>"
|
|
45
45
|
- !ruby/object:Gem::Version
|
|
46
|
-
version: 0.0.
|
|
46
|
+
version: 0.0.12
|
|
47
47
|
type: :runtime
|
|
48
48
|
prerelease: false
|
|
49
49
|
version_requirements: !ruby/object:Gem::Requirement
|
|
50
50
|
requirements:
|
|
51
51
|
- - "~>"
|
|
52
52
|
- !ruby/object:Gem::Version
|
|
53
|
-
version: 0.0.
|
|
53
|
+
version: 0.0.12
|
|
54
54
|
- !ruby/object:Gem::Dependency
|
|
55
55
|
name: relaton-index
|
|
56
56
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -96,10 +96,15 @@ files:
|
|
|
96
96
|
- lib/relaton/ecma/bibliography.rb
|
|
97
97
|
- lib/relaton/ecma/data_fetcher.rb
|
|
98
98
|
- lib/relaton/ecma/data_parser.rb
|
|
99
|
+
- lib/relaton/ecma/edition_parser.rb
|
|
99
100
|
- lib/relaton/ecma/ext.rb
|
|
100
101
|
- lib/relaton/ecma/item.rb
|
|
101
102
|
- lib/relaton/ecma/item_data.rb
|
|
103
|
+
- lib/relaton/ecma/memento_parser.rb
|
|
104
|
+
- lib/relaton/ecma/page_fetcher.rb
|
|
105
|
+
- lib/relaton/ecma/parser_common.rb
|
|
102
106
|
- lib/relaton/ecma/processor.rb
|
|
107
|
+
- lib/relaton/ecma/standard_parser.rb
|
|
103
108
|
- lib/relaton/ecma/util.rb
|
|
104
109
|
- lib/relaton/ecma/version.rb
|
|
105
110
|
- relaton_ecma.gemspec
|