relaton-ecma 2.0.0.pre.alpha.2 → 2.0.0.pre.alpha.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: efa9481085cffe170d5ec5a11c1de88313f91b4f9026ccf9ada0689d730172e0
4
- data.tar.gz: c65688b86e6a80648f4a1906533a81ab098782581059081d45bffa576ab54531
3
+ metadata.gz: ada2f96de510e80052f8d09ccb04ffa43718c7582eea853a10f1bcf30b39754f
4
+ data.tar.gz: 7400f8754bbaf45d9718c1855ea1a5b0363fe8c8080dbfef18177e6c4cddd5ae
5
5
  SHA512:
6
- metadata.gz: ae251ee4a4c0deb57e7a0d7b6b662a3f8fe06db40041b6da343c56e256c7018fcf40426d64e0558ac06df96292c67a340e7b8ba3dd7b3883b00b6004f70dbb4c
7
- data.tar.gz: ce46e548e784c0ec824dfff480176be6639458ea2bb42479f61cbbcf2b484de2d5a7aa6470524202a0ad41d592595d87ecec905275fc0cc7b6d24b386cc4c8f2
6
+ metadata.gz: a685d9c330a66bd2907eccd9fba5bdb0fa439994e65a931652c045fc5f0ad8183dd7a9ff7afca0f56ed80523562d18e15527c0f35a96617c19c41952a1a5fba2
7
+ data.tar.gz: b7b9a2fbdb69b84bc24f9af125733b8ee43c3631bdf1dde3407170d1051562dd2ee3de957d5fc07d6ca15a1b53c6548644c23032556ee5d6116be1c1b6a0a4f1
data/README.adoc CHANGED
@@ -108,7 +108,7 @@ Relaton::Ecma::Bibliography.get '1111'
108
108
  [source,ruby]
109
109
  ----
110
110
  item.to_xml
111
- => "<bibitem id="ECMA6" type="standard" schema-version="v1.4.1">
111
+ => "<bibitem id="ECMA6" type="standard" schema-version="v1.5.6">
112
112
  <fetched>2025-12-29</fetched>
113
113
  <title language="en" script="Latn">7-bit coded character set</title>
114
114
  <uri type="src">https://ecma-international.org/publications-and-standards/standards/ecma-6/</uri>
@@ -122,7 +122,7 @@ With `bibdata: true` option XML output wrapped with `bibdata` element and `ext`
122
122
  [source,ruby]
123
123
  ----
124
124
  item.to_xml bibdata: true
125
- => "<bibdata type="standard" schema-version="v1.4.1">
125
+ => "<bibdata type="standard" schema-version="v1.5.6">
126
126
  <fetched>2025-12-29</fetched>
127
127
  <title language="en" script="Latn">7-bit coded character set</title>
128
128
  <uri type="src">https://ecma-international.org/publications-and-standards/standards/ecma-6/</uri>
@@ -187,6 +187,15 @@ Applicable to modify and delete</a:documentation>
187
187
  <a:documentation>Optional caption of this block</a:documentation>
188
188
  </attribute>
189
189
  </optional>
190
+ <optional>
191
+ <attribute name="position">
192
+ <a:documentation>For an "add" change, whether the change is added before or after the location</a:documentation>
193
+ <choice>
194
+ <value>before</value>
195
+ <value>after</value>
196
+ </choice>
197
+ </attribute>
198
+ </optional>
190
199
  <optional>
191
200
  <element name="location">
192
201
  <a:documentation>The location(s) in the original document which have undergone the change described in this block</a:documentation>
@@ -208,11 +217,15 @@ Applicable to modify and delete</a:documentation>
208
217
  </zeroOrMore>
209
218
  <optional>
210
219
  <element name="newcontent">
211
- <a:documentation>New content to be added to the document; applicable to add and modify</a:documentation>
220
+ <a:documentation>New content to be added to the document; applicable to add and modify.
221
+ Can be blocks and/or sections</a:documentation>
212
222
  <ref name="OptionalId"/>
213
223
  <zeroOrMore>
214
224
  <ref name="BasicBlock"/>
215
225
  </zeroOrMore>
226
+ <zeroOrMore>
227
+ <ref name="section"/>
228
+ </zeroOrMore>
216
229
  </element>
217
230
  </optional>
218
231
  <zeroOrMore>
data/grammars/biblio.rng CHANGED
@@ -1142,11 +1142,11 @@ NOTE: This should preferably be encoded as a URI or short identifier, rather th
1142
1142
  <a:documentation>Information about how long the current description of the bibliographic item is valid for</a:documentation>
1143
1143
  </ref>
1144
1144
  </optional>
1145
- <optional>
1145
+ <zeroOrMore>
1146
1146
  <ref name="depiction">
1147
1147
  <a:documentation>Depiction of the bibliographic item, typically an image</a:documentation>
1148
1148
  </ref>
1149
- </optional>
1149
+ </zeroOrMore>
1150
1150
  </define>
1151
1151
  <define name="ReducedBibliographicItem">
1152
1152
  <a:documentation>Reduced description of a bibliographic resource, without mandatory title and docidentifier, used for document relations
@@ -1939,10 +1939,10 @@ Detailed in https://www.relaton.org/model/relations/</a:documentation>
1939
1939
  <value>hasAnnotation</value>
1940
1940
  <value>draftOf</value>
1941
1941
  <value>hasDraft</value>
1942
- <value>preliminaryDraftOf</value>
1943
- <value>hasPreliminaryDraft</value>
1944
- <value>revisionDraftOf</value>
1945
- <value>hasRevisionDraft</value>
1942
+ <value>predecessorDraftOf</value>
1943
+ <value>hasPredecessorDraft</value>
1944
+ <value>successorDraftOf</value>
1945
+ <value>hasSuccessorDraft</value>
1946
1946
  <value>editionOf</value>
1947
1947
  <value>hasEdition</value>
1948
1948
  <value>updates</value>
@@ -2063,13 +2063,13 @@ provided that it is not the entire bibliographic item that is so related</a:docu
2063
2063
  <ref name="LocalizedString"/>
2064
2064
  </element>
2065
2065
  </optional>
2066
- <oneOrMore>
2066
+ <zeroOrMore>
2067
2067
  <element name="taxon">
2068
2068
  <a:documentation>The keywords as a hierarchical taxonomy. For example, the sequence of `taxon` elements
2069
2069
  `pump`, `centrifugal pump`, `line shaft pump` represents a taxonomic classification</a:documentation>
2070
2070
  <ref name="LocalizedString"/>
2071
2071
  </element>
2072
- </oneOrMore>
2072
+ </zeroOrMore>
2073
2073
  <zeroOrMore>
2074
2074
  <ref name="vocabid">
2075
2075
  <a:documentation>Identifiers for the keyword as a controlled vocabulary</a:documentation>
@@ -1,5 +1,7 @@
1
1
  # frozen_string_literal:true
2
2
 
3
+ require "mechanize"
4
+
3
5
  module Relaton
4
6
  module Ecma
5
7
  # ECMA bibliography module
@@ -4,6 +4,11 @@ require "English"
4
4
  require "mechanize"
5
5
  require "relaton/core"
6
6
  require_relative "../ecma"
7
+ require_relative "parser_common"
8
+ require_relative "page_fetcher"
9
+ require_relative "standard_parser"
10
+ require_relative "memento_parser"
11
+ require_relative "edition_parser"
7
12
  require_relative "data_parser"
8
13
 
9
14
  module Relaton
@@ -16,6 +21,10 @@ module Relaton
16
21
  @index ||= Relaton::Index.find_or_create :ecma, file: "#{INDEXFILE}.yaml"
17
22
  end
18
23
 
24
+ def log_error(msg)
25
+ Util.error msg
26
+ end
27
+
19
28
  def agent
20
29
  @agent ||= Mechanize.new.tap { |a| a.user_agent_alias = Mechanize::AGENT_ALIASES.keys.sample }
21
30
  end
@@ -62,7 +71,7 @@ module Relaton
62
71
 
63
72
  # @param hit [Nokogiri::XML::Element]
64
73
  def parse_page(hit) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
65
- DataParser.new(hit).parse.each { |item| write_file item }
74
+ DataParser.new(hit, @errors).parse.each { |item| write_file item }
66
75
  end
67
76
 
68
77
  # @param type [String]
@@ -87,6 +96,7 @@ module Relaton
87
96
  def fetch(_ = nil)
88
97
  SOURCES.each { |source| html_index source }
89
98
  index.save
99
+ report_errors
90
100
  end
91
101
  end
92
102
  end
@@ -1,235 +1,49 @@
1
1
  module Relaton
2
2
  module Ecma
3
3
  class DataParser
4
- MATTRS = %i[docidentifier title date source ext].freeze
5
- ATTRS = MATTRS + %i[abstract relation edition ext].freeze
4
+ include ParserCommon
6
5
 
7
6
  #
8
7
  # Initialize parser
9
8
  #
10
9
  # @param [Nokogiri::XML::Element] hit document hit
10
+ # @param [Hash] errors error tracking hash
11
11
  #
12
- def initialize(hit)
12
+ def initialize(hit, errors = {})
13
13
  @hit = hit
14
- @bib = {
15
- type: "standard", language: ["en"], script: ["Latn"], place: [Bib::Place.new(city: "Geneva")]
16
- }
17
- @agent = Mechanize.new
14
+ @errors = errors
18
15
  end
19
16
 
20
- def parse # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
17
+ # @return [Array<Relaton::Ecma::ItemData>]
18
+ def parse
21
19
  if @hit[:href]
22
- @agent.user_agent_alias = Mechanize::AGENT_ALIASES.keys[rand(21)]
23
- @doc = get_page @hit[:href]
24
- ATTRS.each { |a| @bib[a] = send "fetch_#{a}" }
20
+ parse_standard
25
21
  else
26
- MATTRS.each { |a| @bib[a] = send "fetch_mem_#{a}" }
22
+ parse_memento
27
23
  end
28
- @bib[:contributor] = contributor
29
- items = [ItemData.new(**@bib)]
30
- items + parse_editions
31
24
  end
32
25
 
33
- #
34
- # Get page with retries
35
- #
36
- # @param [String] url url to fetch
37
- #
38
- # @return [Mechanize::Page] document
39
- #
40
- def get_page(url)
41
- 3.times do |n|
42
- sleep n
43
- doc = @agent.get url
44
- return doc
45
- rescue StandardError => e
46
- Util.error e.message
47
- end
48
- end
49
-
50
- #
51
- # Parse editions
52
- #
53
- # @param [Mechanize::Page] doc document
54
- # @param [Hash] bib bibliographic item the last edition
55
- #
56
- # @return [Array<Relaton::Ecma::ItemData>] editions
57
- #
58
- def parse_editions # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
59
- return [] unless @doc
60
-
61
- docid = @bib[:docid]
62
- @doc.xpath('//div[@id="main"]/div[1]/div/main/article/div/div/standard/div/ul/li').map do |hit|
63
- id, ed, @bib[:date], vol = edition_id_parts hit.at("./span", "./a").text
64
- @bib[:source] = edition_source(hit) + edition_translation_source(ed)
65
- next if ed.nil? || ed.empty?
66
-
67
- @bib[:docidentifier] = id.nil? || id.empty? ? docid : fetch_docidentifier(id)
68
- @bib[:edition] = Bib::Edition.new(content: ed)
69
- @bib[:extent] = create_extent(vol)
70
- ItemData.new(**@bib)
71
- end.compact
72
- end
73
-
74
- def create_extent(vol)
75
- return unless vol && !vol.empty?
76
-
77
- locality = Bib::Locality.new(type: "volume", reference_from: vol)
78
- [Bib::Extent.new(locality: [locality])]
79
- end
80
-
81
- def edition_source(hit)
82
- { "src" => hit.at("./a"), "pdf" => hit.at("./span/a") }.map do |type, a|
83
- Bib::Uri.new(type: type, content: a[:href]) if a
84
- end.compact
85
- end
86
-
87
- #
88
- # Parse edition and date
89
- #
90
- # @param [String] text identifier text
91
- #
92
- # @return [Array<String,nil,Array<Relaton::Bib::Date>>] edition and date
93
- #
94
- def edition_id_parts(text) # rubocop:disable Metrics/MethodLength
95
- %r{^
96
- (?<id>\w+(?:[\d-]+|\sTR/\d+)),?\s
97
- (?:Volume\s(?<vol>[\d.]+),?\s)?
98
- (?<ed>[\d.]+)(?:st|nd|rd|th)?\sedition
99
- (?:[,.]\s(?<dt>\w+\s\d+))?
100
- }x =~ text
101
- date = [dt].compact.map do |d|
102
- on = Date.strptime(d, "%B %Y").strftime("%Y-%m")
103
- Bib::Date.new(type: "published", at: on)
104
- end
105
- [id, ed, date, vol]
106
- end
107
-
108
- # @return [Array<Relaton::Bib::Docidentifier>]
109
- def fetch_docidentifier(id = nil)
110
- id ||= @hit.text
111
- [Bib::Docidentifier.new(type: "ECMA", content: id, primary: true)]
112
- end
113
-
114
- # @return [Array<Relaton::Bib::Uri>]
115
- def fetch_source # rubocop:disable Metrics/AbcSize
116
- source = []
117
- source << Bib::Uri.new(type: "src", content: @hit[:href]) if @hit[:href]
118
- ref = @doc.at('//div[@class="ecma-item-content-wrapper"]/span/a',
119
- '//div[@class="ecma-item-content-wrapper"]/a')
120
- source << Bib::Uri.new(type: "pdf", content: ref[:href]) if ref
121
- source + edition_translation_source(@bib[:edition]&.content)
122
- end
123
-
124
- def fetch_mem_source
125
- @hit.xpath("./div/section/div/p/a").map do |a|
126
- Bib::Uri.new(type: "pdf", content: a[:href])
127
- end
128
- end
129
-
130
- def edition_translation_source(edition)
131
- translation_source.select { |s| s[:ed] == edition }.map { |s| s[:source] }
132
- end
133
-
134
- def translation_source
135
- return [] unless @doc
136
-
137
- @translation_source ||= @doc.xpath("//h2[.='Translations']/following-sibling::ul/li").map do |l|
138
- a = l.at("span/a")
139
- id = l.at("span").text
140
- %r{\w+[\d-]+,\s(?<lang>\w+)\sversion,\s(?<ed>[\d.]+)(?:st|nd|rd|th)\sedition} =~ id
141
- case lang
142
- when "Japanese"
143
- { ed: ed, source: Bib::Uri.new(type: "pdf", language: "ja", script: "Jpan", content: a[:href]) }
144
- end
145
- end.compact
146
- end
147
-
148
- # @return [Array<Relaton::Bib::Title>]
149
- def fetch_title
150
- @doc.xpath('//p[@class="ecma-item-short-description"]').map do |t|
151
- Bib::Title.new(content: t.text.strip, language: "en", script: "Latn")
152
- end
153
- end
154
-
155
- # @return [Array<Relaton::Bib::LocalizedMarkedUpString>]
156
- def fetch_abstract
157
- content = @doc.xpath('//div[@class="ecma-item-content"]/p').map do |a|
158
- a.text.strip.squeeze(" ").gsub("\r\n", "")
159
- end.join "\n"
160
- return [] if content.empty?
161
-
162
- [Bib::LocalizedMarkedUpString.new(content: content, language: "en", script: "Latn")]
163
- end
164
-
165
- # @return [Array<Relaton::Bib::Date>]
166
- def fetch_date
167
- @doc.xpath('//p[@class="ecma-item-edition"]').map do |d|
168
- date = d.text.split(", ").last
169
- Bib::Date.new type: "published", at: date
170
- end
171
- end
172
-
173
- # @return [Array<Relaton::Bib::Relation>]
174
- def fetch_relation # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity
175
- @doc.xpath("//ul[@class='ecma-item-archives']/li").filter_map do |rel|
176
- ref, ed, date, vol = edition_id_parts rel.at("span").text
177
- next if ed.nil? || ed.empty?
26
+ private
178
27
 
179
- docid = Bib::Docidentifier.new(type: "ECMA", content: ref, primary: true)
180
- source = rel.xpath("span/a").map { |l| Bib::Uri.new type: "pdf", content: l[:href] }
181
- edition = Bib::Edition.new content: ed
182
- extent = create_extent(vol)
183
- bibitem = ItemData.new(
184
- docidentifier: [docid], formattedref: ref, date: date, edition: edition,
185
- source: source, extent: extent
186
- )
187
- Bib::Relation.new(type: "updates", bibitem: bibitem)
188
- end
189
- end
190
-
191
- #
192
- # @return [Relaton::Bib::Edition, nil]
193
- #
194
- def fetch_edition
195
- cnt = @doc.at('//p[@class="ecma-item-edition"]')&.text&.match(/^\d+(?=(?:st|nd|th|rd))/)&.to_s
196
- Bib::Edition.new(content: cnt) if cnt && !cnt.empty?
197
- end
198
-
199
- def contributor
200
- orgname = Bib::TypedLocalizedString.new(content: "Ecma International", language: "en", script: "Latn")
201
- org = Bib::Organization.new name: [orgname]
202
- role = Bib::Contributor::Role.new type: "publisher"
203
- [Bib::Contributor.new(organization: org, role: [role])]
204
- end
205
-
206
- # @return [Array<Relaton::Bib::Docidentifier>]
207
- def fetch_mem_docidentifier
208
- code = "ECMA MEM/#{@hit.at('div[1]//p').text}"
209
- fetch_docidentifier code
210
- end
211
-
212
- def fetch_mem_date
213
- date = @hit.at("div[2]//p").text
214
- on = Date.strptime(date, "%B %Y").strftime "%Y-%m"
215
- [Bib::Date.new(type: "published", at: on)]
216
- end
217
-
218
- def fetch_mem_title
219
- year = @hit.at("div[1]//p").text
220
- content = "\"Memento #{year}\" for year #{year}"
221
- [Bib::Title.new(content: content, language: "en", script: "Latn")]
222
- end
223
-
224
- def fetch_ext
225
- Ext.new(doctype: fetch_doctype, flavor: "ecma")
28
+ def parse_standard
29
+ doc = PageFetcher.new.get(@hit[:href])
30
+ parser = StandardParser.new(hit: @hit, doc: doc, errors: @errors)
31
+ bib = parser.to_bib_hash
32
+ bib[:contributor] = contributor
33
+ items = [ItemData.new(**bib)]
34
+ edition_parser = EditionParser.new(
35
+ doc: doc, bib: bib, errors: @errors,
36
+ translation_source: parser.translation_source
37
+ )
38
+ items + edition_parser.parse
226
39
  end
227
40
 
228
- def fetch_doctype
229
- Bib::Doctype.new content: "document"
41
+ def parse_memento
42
+ parser = MementoParser.new(hit: @hit, errors: @errors)
43
+ bib = parser.to_bib_hash
44
+ bib[:contributor] = contributor
45
+ [ItemData.new(**bib)]
230
46
  end
231
-
232
- alias_method :fetch_mem_ext, :fetch_ext
233
47
  end
234
48
  end
235
49
  end
@@ -0,0 +1,80 @@
1
+ module Relaton
2
+ module Ecma
3
+ class EditionParser
4
+ include ParserCommon
5
+
6
+ # @param [Mechanize::Page] doc document page
7
+ # @param [Hash] bib base bibliographic item attributes
8
+ # @param [Hash] errors error tracking hash
9
+ # @param [Array] translation_source precomputed translation sources
10
+ def initialize(doc:, bib:, errors: {}, translation_source: [])
11
+ @doc = doc
12
+ @bib = bib
13
+ @errors = errors
14
+ @translation_source = translation_source
15
+ end
16
+
17
+ # @return [Array<Relaton::Ecma::ItemData>] editions
18
+ def parse # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
19
+ return [] unless @doc
20
+
21
+ docid = @bib[:docidentifier]
22
+ @doc.xpath('//div[@id="main"]/div[1]/div/main/article/div/div/standard/div/ul/li').map do |hit|
23
+ bib = @bib.dup
24
+ id, ed, bib[:date], vol = edition_id_parts hit.at("./span", "./a").text
25
+ bib[:source] = edition_source(hit) + edition_translation_source(ed)
26
+ next if ed.nil? || ed.empty?
27
+
28
+ bib[:docidentifier] = id.nil? || id.empty? ? docid : fetch_docidentifier(id)
29
+ @errors[:edition_docidentifier] &&= bib[:docidentifier].empty?
30
+ bib[:edition] = Bib::Edition.new(content: ed)
31
+ bib[:extent] = create_extent(vol)
32
+ @errors[:edition_extent] &&= bib[:extent].nil?
33
+ ItemData.new(**bib)
34
+ end.compact
35
+ end
36
+
37
+ #
38
+ # Parse edition and date
39
+ #
40
+ # @param [String] text identifier text
41
+ #
42
+ # @return [Array<String,nil,Array<Relaton::Bib::Date>>] edition and date
43
+ #
44
+ def edition_id_parts(text) # rubocop:disable Metrics/MethodLength
45
+ %r{^
46
+ (?<id>\w+(?:[\d-]+|\sTR/\d+)),?\s
47
+ (?:Volume\s(?<vol>[\d.]+),?\s)?
48
+ (?<ed>[\d.]+)(?:st|nd|rd|th)?\sedition
49
+ (?:[,.]\s(?<dt>\w+\s\d+))?
50
+ }x =~ text
51
+ date = [dt].compact.map do |d|
52
+ on = Date.strptime(d, "%B %Y").strftime("%Y-%m")
53
+ Bib::Date.new(type: "published", at: on)
54
+ end
55
+ [id, ed, date, vol]
56
+ end
57
+
58
+ def edition_source(hit)
59
+ es = { "src" => hit.at("./a"), "pdf" => hit.at("./span/a") }.map do |type, a|
60
+ Bib::Uri.new(type: type, content: a[:href]) if a
61
+ end.compact
62
+ @errors[:edition_source] &&= es.empty?
63
+ es
64
+ end
65
+
66
+ def create_extent(vol)
67
+ return unless vol && !vol.empty?
68
+
69
+ locality = Bib::Locality.new(type: "volume", reference_from: vol)
70
+ [Bib::Extent.new(locality: [locality])]
71
+ end
72
+
73
+ private
74
+
75
+ def edition_translation_source(edition)
76
+ @translation_source.select { |s| s[:ed] == edition }.map { |s| s[:source] }
77
+ end
78
+ end
79
+ end
80
+ end
@@ -0,0 +1,60 @@
1
+ module Relaton
2
+ module Ecma
3
+ class MementoParser
4
+ include ParserCommon
5
+
6
+ ATTRS = %i[docidentifier title date source ext].freeze
7
+
8
+ # @param [Nokogiri::XML::Element] hit document hit
9
+ # @param [Hash] errors error tracking hash
10
+ def initialize(hit:, errors: {})
11
+ @hit = hit
12
+ @errors = errors
13
+ end
14
+
15
+ # @return [Hash] bibliographic item attributes
16
+ def to_bib_hash
17
+ bib = default_bib_hash
18
+ ATTRS.each { |a| bib[a] = send "fetch_#{a}" }
19
+ bib
20
+ end
21
+
22
+ private
23
+
24
+ # @return [Array<Relaton::Bib::Docidentifier>]
25
+ def fetch_docidentifier
26
+ code = "ECMA MEM/#{@hit.at('div[1]//p').text}"
27
+ docid = super(code)
28
+ @errors[:memento_docidentifier] &&= docid.empty?
29
+ docid
30
+ end
31
+
32
+ # @return [Array<Relaton::Bib::Title>]
33
+ def fetch_title
34
+ year = @hit.at("div[1]//p").text
35
+ content = "\"Memento #{year}\" for year #{year}"
36
+ result = [Bib::Title.new(content: content, language: "en", script: "Latn")]
37
+ @errors[:memento_title] &&= result.empty?
38
+ result
39
+ end
40
+
41
+ # @return [Array<Relaton::Bib::Date>]
42
+ def fetch_date
43
+ date = @hit.at("div[2]//p").text
44
+ on = Date.strptime(date, "%B %Y").strftime "%Y-%m"
45
+ result = [Bib::Date.new(type: "published", at: on)]
46
+ @errors[:memento_date] &&= result.empty?
47
+ result
48
+ end
49
+
50
+ # @return [Array<Relaton::Bib::Uri>]
51
+ def fetch_source
52
+ result = @hit.xpath("./div/section/div/p/a").map do |a|
53
+ Bib::Uri.new(type: "pdf", content: a[:href])
54
+ end
55
+ @errors[:memento_source] &&= result.empty?
56
+ result
57
+ end
58
+ end
59
+ end
60
+ end
@@ -0,0 +1,27 @@
1
+ module Relaton
2
+ module Ecma
3
+ class PageFetcher
4
+ def initialize
5
+ @agent = Mechanize.new
6
+ @agent.user_agent_alias = Mechanize::AGENT_ALIASES.keys.sample # any known alias; no hard-coded key count
7
+ end
8
+
9
+ #
10
+ # Get page, making up to three attempts and sleeping 0, 1, then 2 seconds before them
11
+ #
12
+ # @param [String] url url to fetch
13
+ #
14
+ # @return [Mechanize::Page, nil] document, or nil when every attempt raised
15
+ #
16
+ def get(url)
17
+ 3.times do |n|
18
+ sleep n
19
+ return @agent.get(url)
20
+ rescue StandardError => e
21
+ Util.error e.message
22
+ end
23
+ nil # without this the method would return 3, the value of `3.times`
24
+ end
25
+ end
26
+ end
27
+ end
@@ -0,0 +1,33 @@
1
+ module Relaton
2
+ module Ecma
3
+ module ParserCommon
4
+ def default_bib_hash
5
+ {
6
+ type: "standard", language: ["en"], script: ["Latn"], place: [Bib::Place.new(city: "Geneva")]
7
+ }
8
+ end
9
+
10
+ def contributor
11
+ orgname = Bib::TypedLocalizedString.new(content: "Ecma International", language: "en", script: "Latn")
12
+ org = Bib::Organization.new name: [orgname]
13
+ role = Bib::Contributor::Role.new type: "publisher"
14
+ [Bib::Contributor.new(organization: org, role: [role])]
15
+ end
16
+
17
+ # @return [Array<Relaton::Bib::Docidentifier>]
18
+ def fetch_docidentifier(id = nil)
19
+ return [] if id.nil? || id.empty?
20
+
21
+ [Bib::Docidentifier.new(type: "ECMA", content: id, primary: true)]
22
+ end
23
+
24
+ def fetch_ext
25
+ Ext.new(doctype: fetch_doctype, flavor: "ecma")
26
+ end
27
+
28
+ def fetch_doctype
29
+ Bib::Doctype.new content: "document"
30
+ end
31
+ end
32
+ end
33
+ end
@@ -0,0 +1,134 @@
1
+ module Relaton
2
+ module Ecma
3
+ class StandardParser
4
+ include ParserCommon
5
+
6
+ ATTRS = %i[docidentifier title date source abstract relation edition ext].freeze
7
+
8
+ # @param [Nokogiri::XML::Element] hit document hit
9
+ # @param [Mechanize::Page] doc fetched document page
10
+ # @param [Hash] errors error tracking hash
11
+ def initialize(hit:, doc:, errors: {})
12
+ @hit = hit
13
+ @doc = doc
14
+ @errors = errors
15
+ end
16
+
17
+ # @return [Hash] bibliographic item attributes
18
+ def to_bib_hash
19
+ bib = default_bib_hash
20
+ ATTRS.each { |a| bib[a] = send "fetch_#{a}" }
21
+ bib
22
+ end
23
+
24
+ # @return [Array] precomputed translation sources
25
+ def translation_source
26
+ @translation_source ||= parse_translation_source
27
+ end
28
+
29
+ # @return [Array<Relaton::Bib::Docidentifier>]
30
+ def fetch_docidentifier
31
+ result = super(@hit.text)
32
+ @errors[:standard_docidentifier] &&= result.empty?
33
+ result
34
+ end
35
+
36
+ # @return [Array<Relaton::Bib::Title>]
37
+ def fetch_title
38
+ result = @doc.xpath('//p[@class="ecma-item-short-description"]').map do |t|
39
+ Bib::Title.new(content: t.text.strip, language: "en", script: "Latn")
40
+ end
41
+ @errors[:standard_title] &&= result.empty?
42
+ result
43
+ end
44
+
45
+ # @return [Array<Relaton::Bib::LocalizedMarkedUpString>]
46
+ def fetch_abstract
47
+ content = @doc.xpath('//div[@class="ecma-item-content"]/p').map do |a|
48
+ a.text.strip.squeeze(" ").gsub("\r\n", "")
49
+ end.join "\n"
50
+ return [] if content.empty?
51
+
52
+ result = [Bib::Abstract.new(content: content, language: "en", script: "Latn")]
53
+ @errors[:standard_abstract] &&= result.empty?
54
+ result
55
+ end
56
+
57
+ # @return [Array<Relaton::Bib::Date>]
58
+ def fetch_date
59
+ result = @doc.xpath('//p[@class="ecma-item-edition"]').map do |d|
60
+ date = d.text.split(", ").last
61
+ Bib::Date.new type: "published", at: date
62
+ end
63
+ @errors[:standard_date] &&= result.empty?
64
+ result
65
+ end
66
+
67
+ # @return [Array<Relaton::Bib::Uri>]
68
+ def fetch_source # rubocop:disable Metrics/AbcSize
69
+ source = []
70
+ source << Bib::Uri.new(type: "src", content: @hit[:href]) if @hit[:href]
71
+ ref = @doc.at('//div[@class="ecma-item-content-wrapper"]/span/a',
72
+ '//div[@class="ecma-item-content-wrapper"]/a')
73
+ source << Bib::Uri.new(type: "pdf", content: ref[:href]) if ref
74
+ result = source + edition_translation_source(fetch_edition_content)
75
+ @errors[:standard_source] &&= result.empty?
76
+ result
77
+ end
78
+
79
+ # @return [Array<Relaton::Bib::Relation>]
80
+ def fetch_relation # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity
81
+ edition_parser = EditionParser.new(doc: @doc, bib: {}, errors: @errors)
82
+ result = @doc.xpath("//ul[@class='ecma-item-archives']/li").filter_map do |rel|
83
+ ref, ed, date, vol = edition_parser.edition_id_parts rel.at("span").text
84
+ next if ed.nil? || ed.empty?
85
+
86
+ docid = Bib::Docidentifier.new(type: "ECMA", content: ref, primary: true)
87
+ source = rel.xpath("span/a").map { |l| Bib::Uri.new type: "pdf", content: l[:href] }
88
+ edition = Bib::Edition.new content: ed
89
+ extent = edition_parser.create_extent(vol)
90
+ @errors[:standard_relation_extent] &&= extent.nil?
91
+ bibitem = ItemData.new(
92
+ docidentifier: [docid], formattedref: Bib::Formattedref.new(content: ref), date: date, edition: edition,
93
+ source: source, extent: extent
94
+ )
95
+ Bib::Relation.new(type: "updates", bibitem: bibitem)
96
+ end
97
+ @errors[:standard_relation] &&= result.empty?
98
+ result
99
+ end
100
+
101
+ # @return [Relaton::Bib::Edition, nil]
102
+ def fetch_edition
103
+ cnt = fetch_edition_content
104
+ result = Bib::Edition.new(content: cnt) if cnt && !cnt.empty?
105
+ @errors[:standard_edition] &&= result.nil?
106
+ result
107
+ end
108
+
109
+ private
110
+
111
+ def fetch_edition_content
112
+ @doc.at('//p[@class="ecma-item-edition"]')&.text&.match(/^\d+(?=(?:st|nd|th|rd))/)&.to_s
113
+ end
114
+
115
+ def edition_translation_source(edition)
116
+ translation_source.select { |s| s[:ed] == edition }.map { |s| s[:source] }
117
+ end
118
+
119
+ def parse_translation_source
120
+ return [] unless @doc
121
+
122
+ @doc.xpath("//h2[.='Translations']/following-sibling::ul/li").map do |l|
123
+ a = l.at("span/a")
124
+ id = l.at("span").text
125
+ %r{\w+[\d-]+,\s(?<lang>\w+)\sversion,\s(?<ed>[\d.]+)(?:st|nd|rd|th)\sedition} =~ id
126
+ case lang
127
+ when "Japanese"
128
+ { ed: ed, source: Bib::Uri.new(type: "pdf", language: "ja", script: "Jpan", content: a[:href]) }
129
+ end
130
+ end.compact
131
+ end
132
+ end
133
+ end
134
+ end
@@ -1,5 +1,5 @@
1
1
  module Relaton
2
2
  module Ecma
3
- VERSION = "2.0.0-alpha.2".freeze
3
+ VERSION = "2.0.0-alpha.3".freeze
4
4
  end
5
5
  end
data/relaton_ecma.gemspec CHANGED
@@ -28,7 +28,7 @@ Gem::Specification.new do |spec| # rubocop:disable Metrics/BlockLength
28
28
  spec.require_paths = ["lib"]
29
29
 
30
30
  spec.add_dependency "mechanize", "~> 2.10"
31
- spec.add_dependency "relaton-bib", "~> 2.0.0-alpha.4"
32
- spec.add_dependency "relaton-core", "~> 0.0.9"
31
+ spec.add_dependency "relaton-bib", "~> 2.0.0-alpha.7"
32
+ spec.add_dependency "relaton-core", "~> 0.0.12"
33
33
  spec.add_dependency "relaton-index", "~> 0.2.4"
34
34
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: relaton-ecma
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.0.pre.alpha.2
4
+ version: 2.0.0.pre.alpha.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
@@ -29,28 +29,28 @@ dependencies:
29
29
  requirements:
30
30
  - - "~>"
31
31
  - !ruby/object:Gem::Version
32
- version: 2.0.0.pre.alpha.4
32
+ version: 2.0.0.pre.alpha.7
33
33
  type: :runtime
34
34
  prerelease: false
35
35
  version_requirements: !ruby/object:Gem::Requirement
36
36
  requirements:
37
37
  - - "~>"
38
38
  - !ruby/object:Gem::Version
39
- version: 2.0.0.pre.alpha.4
39
+ version: 2.0.0.pre.alpha.7
40
40
  - !ruby/object:Gem::Dependency
41
41
  name: relaton-core
42
42
  requirement: !ruby/object:Gem::Requirement
43
43
  requirements:
44
44
  - - "~>"
45
45
  - !ruby/object:Gem::Version
46
- version: 0.0.9
46
+ version: 0.0.12
47
47
  type: :runtime
48
48
  prerelease: false
49
49
  version_requirements: !ruby/object:Gem::Requirement
50
50
  requirements:
51
51
  - - "~>"
52
52
  - !ruby/object:Gem::Version
53
- version: 0.0.9
53
+ version: 0.0.12
54
54
  - !ruby/object:Gem::Dependency
55
55
  name: relaton-index
56
56
  requirement: !ruby/object:Gem::Requirement
@@ -96,10 +96,15 @@ files:
96
96
  - lib/relaton/ecma/bibliography.rb
97
97
  - lib/relaton/ecma/data_fetcher.rb
98
98
  - lib/relaton/ecma/data_parser.rb
99
+ - lib/relaton/ecma/edition_parser.rb
99
100
  - lib/relaton/ecma/ext.rb
100
101
  - lib/relaton/ecma/item.rb
101
102
  - lib/relaton/ecma/item_data.rb
103
+ - lib/relaton/ecma/memento_parser.rb
104
+ - lib/relaton/ecma/page_fetcher.rb
105
+ - lib/relaton/ecma/parser_common.rb
102
106
  - lib/relaton/ecma/processor.rb
107
+ - lib/relaton/ecma/standard_parser.rb
103
108
  - lib/relaton/ecma/util.rb
104
109
  - lib/relaton/ecma/version.rb
105
110
  - relaton_ecma.gemspec