relaton-un 0.2.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -107,7 +107,9 @@
107
107
  <zeroOrMore>
108
108
  <ref name="submissionlanguage"/>
109
109
  </zeroOrMore>
110
- <ref name="editorialgroup"/>
110
+ <optional>
111
+ <ref name="editorialgroup"/>
112
+ </optional>
111
113
  <zeroOrMore>
112
114
  <ref name="ics"/>
113
115
  </zeroOrMore>
@@ -117,6 +119,9 @@
117
119
  <optional>
118
120
  <ref name="session"/>
119
121
  </optional>
122
+ <optional>
123
+ <ref name="job_number"/>
124
+ </optional>
120
125
  </define>
121
126
  <define name="preface">
122
127
  <element name="preface">
@@ -136,6 +141,109 @@
136
141
  <ref name="Basic-Section"/>
137
142
  </element>
138
143
  </define>
144
+ <define name="Clause-Section">
145
+ <optional>
146
+ <attribute name="id">
147
+ <data type="ID"/>
148
+ </attribute>
149
+ </optional>
150
+ <optional>
151
+ <attribute name="language"/>
152
+ </optional>
153
+ <optional>
154
+ <attribute name="script"/>
155
+ </optional>
156
+ <optional>
157
+ <attribute name="inline-header">
158
+ <data type="boolean"/>
159
+ </attribute>
160
+ </optional>
161
+ <optional>
162
+ <attribute name="obligation">
163
+ <choice>
164
+ <value>normative</value>
165
+ <value>informative</value>
166
+ </choice>
167
+ </attribute>
168
+ </optional>
169
+ <optional>
170
+ <attribute name="unnumbered">
171
+ <data type="boolean"/>
172
+ </attribute>
173
+ </optional>
174
+ <optional>
175
+ <ref name="section-title"/>
176
+ </optional>
177
+ <group>
178
+ <group>
179
+ <zeroOrMore>
180
+ <ref name="BasicBlock"/>
181
+ </zeroOrMore>
182
+ <zeroOrMore>
183
+ <ref name="note"/>
184
+ </zeroOrMore>
185
+ </group>
186
+ <zeroOrMore>
187
+ <choice>
188
+ <ref name="clause-subsection"/>
189
+ <ref name="terms"/>
190
+ <ref name="definitions"/>
191
+ </choice>
192
+ </zeroOrMore>
193
+ </group>
194
+ </define>
195
+ <define name="Annex-Section">
196
+ <optional>
197
+ <attribute name="id">
198
+ <data type="ID"/>
199
+ </attribute>
200
+ </optional>
201
+ <optional>
202
+ <attribute name="language"/>
203
+ </optional>
204
+ <optional>
205
+ <attribute name="script"/>
206
+ </optional>
207
+ <optional>
208
+ <attribute name="inline-header">
209
+ <data type="boolean"/>
210
+ </attribute>
211
+ </optional>
212
+ <optional>
213
+ <attribute name="obligation">
214
+ <choice>
215
+ <value>normative</value>
216
+ <value>informative</value>
217
+ </choice>
218
+ </attribute>
219
+ </optional>
220
+ <optional>
221
+ <attribute name="unnumbered">
222
+ <data type="boolean"/>
223
+ </attribute>
224
+ </optional>
225
+ <optional>
226
+ <ref name="section-title"/>
227
+ </optional>
228
+ <group>
229
+ <group>
230
+ <zeroOrMore>
231
+ <ref name="BasicBlock"/>
232
+ </zeroOrMore>
233
+ <zeroOrMore>
234
+ <ref name="note"/>
235
+ </zeroOrMore>
236
+ </group>
237
+ <zeroOrMore>
238
+ <choice>
239
+ <ref name="annex-subsection"/>
240
+ <ref name="terms"/>
241
+ <ref name="definitions"/>
242
+ <ref name="references"/>
243
+ </choice>
244
+ </zeroOrMore>
245
+ </group>
246
+ </define>
139
247
  </include>
140
248
  <define name="session">
141
249
  <element name="session">
@@ -221,9 +329,15 @@
221
329
  <value>general</value>
222
330
  <value>limited</value>
223
331
  <value>restricted</value>
332
+ <value>provisional</value>
224
333
  </choice>
225
334
  </element>
226
335
  </define>
336
+ <define name="job_number">
337
+ <element name="job_number">
338
+ <text/>
339
+ </element>
340
+ </define>
227
341
  <define name="un-standard">
228
342
  <element name="un-standard">
229
343
  <ref name="bibdata"/>
@@ -6,6 +6,8 @@ require "relaton_un/hit_collection"
6
6
  require "relaton_un/hit"
7
7
  require "relaton_un/hash_converter"
8
8
  require "relaton_un/xml_parser"
9
+ require "relaton_un/session"
10
+ require "relaton_un/editorialgroup"
9
11
 
10
12
  module RelatonUn
11
13
  class Error < StandardError; end
@@ -0,0 +1,25 @@
1
+ module RelatonUn
2
+ class EditorialGroup
3
+ include RelatonBib
4
+
5
+ # @return [Array<String>]
6
+ attr_reader :committee
7
+
8
+ # @param committee [Array<String>]
9
+ def initialize(committee)
10
+ @committee = committee
11
+ end
12
+
13
+ # @param builder [Nokogiri::XML::Builder]
14
+ def to_xml(builder)
15
+ builder.editorialgroup do |b|
16
+ committee.each { |c| b.committee c }
17
+ end
18
+ end
19
+
20
+ # @return [Array<Hash>, Hash]
21
+ def to_hash
22
+ single_element_array(committee.map { |c| { "committee" => c } })
23
+ end
24
+ end
25
+ end
@@ -1,5 +1,33 @@
1
1
  module RelatonUn
2
2
  class HashConverter < RelatonBib::HashConverter
3
+ class << self
4
+ # @override RelatonBib::HashConverter.hash_to_bib
5
+ # @param args [Hash]
6
+ # @param nested [TrueClass, FalseClass]
7
+ # @return [Hash]
8
+ def hash_to_bib(args, nested = false)
9
+ ret = super
10
+ return if ret.nil?
3
11
 
12
+ session_hash_to_bib ret
13
+ ret
14
+ end
15
+
16
+ private
17
+
18
+ # @param ret [Hash]
19
+ def session_hash_to_bib(ret)
20
+ ret[:session] = Session.new(ret[:session]) if ret[:session]
21
+ end
22
+
23
+ # @param ret [Hash]
24
+ def editorialgroup_hash_to_bib(ret)
25
+ eg = ret[:editorialgroup]
26
+ return unless eg
27
+
28
+ committee = eg.map { |e| e[:committee] }
29
+ ret[:editorialgroup] = EditorialGroup.new array(committee)
30
+ end
31
+ end
4
32
  end
5
33
  end
@@ -3,6 +3,51 @@
3
3
  module RelatonUn
4
4
  # Hit.
5
5
  class Hit < RelatonBib::Hit
6
+ # rubocop:disable Layout/LineLength
7
+
8
+ # Note: some documents have the distribution PRO (e.g. A/47/PV.102/CORR.1, A/47/PV.54)
9
+ BODY = {
10
+ "A" => "General Assembly",
11
+ "E" => "Economic and Social Council",
12
+ "S" => "Security Council",
13
+ "T" => "Trusteeship Council",
14
+ "ACC" => "Administrative Committee on Coordination",
15
+ "AT" => "United Nations Administrative Tribunal",
16
+ "CAT" => "Committee against Torture",
17
+ "CCPR" => "Human Rights Committee",
18
+ "CD" => "Conference on Disarmament",
19
+ "CEDAW" => "Committee on the Elimination of All Forms of Discrimination against Women",
20
+ "CERD" => "Committee on the Elimination of Racial Discrimination",
21
+ "CRC" => "Committee on the Rights of the Child",
22
+ "DC" => "Disarmament Commission",
23
+ "DP" => "United Nations Development Programme",
24
+ "HS" => "United Nations Centre for Human Settlements (HABITAT)",
25
+ "TD" => "United Nations Conference on Trade and Development",
26
+ "UNEP" => "United Nations Environment Programme",
27
+ "TRADE" => "Committee on Trade",
28
+ "CEFACT" => "Centre for Trade Facilitation and Electronic Business",
29
+ "C.1" => "Disarmament and International Security Committee",
30
+ "C.2" => "Economic and Financial Committee",
31
+ "C.3" => "Social, Humanitarian & Cultural Issues",
32
+ "C.4" => "Special Political and Decolonization Committee",
33
+ "C.5" => "Administrative and Budgetary Committee",
34
+ "C.6" => "Sixth Committee (Legal)",
35
+ "PC" => "Preparatory Committee",
36
+ "AEC" => "Atomic Energy Commission",
37
+ "AGRI" => "Committee on Agriculture",
38
+ "AMCEN" => "African Ministerial Conference on the Environment",
39
+ "AMCOW" => "African Ministers’ Council on Water",
40
+ "ECA" => "Economic Commission for Africa",
41
+ "ESCAP" => "Economic and Social Commission for Asia and Pacific",
42
+ "ECE" => "Economic Commission for Europe",
43
+ "ECWA" => "Economic Commission for Western Asia",
44
+ "UNFF" => "United Nations Forum on Forests",
45
+ "ENERGY" => "Committee on Sustainable Energy",
46
+ "FAO" => "Food and Agriculture Organization",
47
+ "UNCTAD" => "United Nations Conference on Trade and Development",
48
+ }.freeze
49
+ # rubocop:enable Layout/LineLength
50
+
6
51
  # Parse page.
7
52
  # @return [RelatonUn::UnBibliographicItem]
8
53
  def fetch
@@ -11,48 +56,97 @@ module RelatonUn
11
56
 
12
57
  private
13
58
 
59
+ # rubocop:disable Metrics/MethodLength
60
+
61
+ # @return [RelatonUn::UnBibliographicItem]
14
62
  def un_bib_item
15
63
  UnBibliographicItem.new(
16
64
  type: "standard",
17
65
  fetched: Date.today.to_s,
18
- docid: docid,
66
+ docid: fetch_docid,
19
67
  docnumber: hit[:ref],
20
68
  language: ["en"],
21
69
  script: ["Latn"],
22
- title: title,
23
- date: date,
24
- link: link,
25
- keyword: keyword
70
+ title: fetch_title,
71
+ date: fetch_date,
72
+ link: fetch_link,
73
+ keyword: fetch_keyword,
74
+ session: fetch_session,
75
+ distribution: fetch_distribution,
76
+ editorialgroup: fetch_editorialgroup,
77
+ classification: fetch_classification,
26
78
  )
27
79
  end
80
+ # rubocop:enable Metrics/MethodLength
28
81
 
29
82
  # @return [Array<RelatonBib::DocumentIdentifier>]
30
- def docid
31
- [RelatonBib::DocumentIdentifier.new(id: hit[:ref], type: "UN")]
83
+ def fetch_docid
84
+ hit[:symbol].map do |s|
85
+ RelatonBib::DocumentIdentifier.new(id: s, type: "UN")
86
+ end
32
87
  end
33
88
 
34
89
  # @return [Array<RelatonBib::TypedTitleString>]
35
- def title
36
- fs = RelatonBib::FormattedString.new(content: hit[:title], language: "en", script: "Latn")
37
- [RelatonBib::TypedTitleString.new(type: "main", title: fs)]
90
+ def fetch_title
91
+ # fs = RelatonBib::FormattedString.new(
92
+ # content: hit[:title], language: "en", script: "Latn",
93
+ # )
94
+ # [RelatonBib::TypedTitleString.new(type: "main", title: fs)]
95
+ # [{ title_main: hit[:title], language: "en", script: "Latn" }]
96
+ RelatonBib::TypedTitleString.from_string hit[:title], "en", "Latn"
38
97
  end
39
98
 
40
99
  # @return [Array<RelatonBib::BibliographicDate>]
41
- def date
100
+ def fetch_date
42
101
  d = []
43
- d << RelatonBib::BibliographicDate.new(type: "published", on: hit[:date_pub]) if hit[:date_pub]
44
- d << RelatonBib::BibliographicDate.new(type: "issued", on: hit[:date_rel]) if hit[:date_rel]
102
+ d << bibdate("published", hit[:date_pub]) if hit[:date_pub]
103
+ d << bibdate("issued", hit[:date_rel]) if hit[:date_rel]
45
104
  d
46
105
  end
47
106
 
107
+ # @param type [String]
108
+ # @param on [String]
109
+ # @return [RelatonBib::BibliographicDate]
110
+ def bibdate(type, on)
111
+ RelatonBib::BibliographicDate.new type: type, on: on
112
+ end
113
+
48
114
  # @return [Array<RelatonBib::TypedUri>]
49
- def link
115
+ def fetch_link
50
116
  hit[:link].map { |l| RelatonBib::TypedUri.new l }
51
117
  end
52
118
 
53
119
  # @return [Array<String>]
54
- def keyword
120
+ def fetch_keyword
55
121
  hit[:keyword].split(", ")
56
122
  end
123
+
124
+ # @return [RelatonUn::Session]
125
+ def fetch_session
126
+ Session.new(session_number: hit[:session], agenda_id: hit[:agenda])
127
+ end
128
+
129
+ # @return [String]
130
+ def fetch_distribution
131
+ UnBibliographicItem::DISTRIBUTIONS[hit[:distribution]]
132
+ end
133
+
134
+ # @return [RelatonUn::EditorialGroup, NilClass]
135
+ def fetch_editorialgroup
136
+ tc = hit[:ref].match(/^[\S]+/).to_s.split(/\/|-/).reduce([]) do |m, v|
137
+ if BODY[v] then m << BODY[v]
138
+ elsif v =~ /(AC|C|CN|CONF|GC|SC|Sub|WG).\d+|PC/ then m << v
139
+ else m
140
+ end
141
+ end.uniq
142
+ return unless tc.any?
143
+
144
+ RelatonUn::EditorialGroup.new tc
145
+ end
146
+
147
+ # @return [Array<RelatonBib::Classification>]
148
+ def fetch_classification
149
+ [RelatonBib::Classification.new(type: "area", value: "UNDOC")]
150
+ end
57
151
  end
58
152
  end
@@ -6,7 +6,8 @@ require "http-cookie"
6
6
  module RelatonUn
7
7
  # Page of hit collection.
8
8
  class HitCollection < RelatonBib::HitCollection
9
- AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"
9
+ AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) "\
10
+ "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"
10
11
  DOMAIN = "https://documents.un.org"
11
12
  BOUNDARY = "----WebKitFormBoundary6hkaBvITDck8dHCn"
12
13
 
@@ -17,52 +18,10 @@ module RelatonUn
17
18
  @jar = HTTP::CookieJar.new
18
19
  @http = Net::HTTP.new @uri.host, @uri.port
19
20
  @http.use_ssl = true
21
+ @http.read_timeout = 120
20
22
  if (form_resp = get_page)
21
- form = Nokogiri::HTML form_resp.body
22
- form_data = form.xpath(
23
- "//input[@type!='radio']",
24
- "//input[@type='radio'][@checked]",
25
- "//select[@name!='view:_id1:_id2:cbLang']",
26
- "//textarea"
27
- ).reduce([]) do |m, i|
28
- v = case i[:name]
29
- when "view:_id1:_id2:txtSymbol" then text
30
- when "view:_id1:_id2:cbType" then "FP"
31
- when "view:_id1:_id2:cbSort" then "R"
32
- when "$$xspsubmitid" then "view:_id1:_id2:_id130"
33
- when "$$xspsubmitscroll" then "0|167"
34
- else i[:value]
35
- end
36
- m << %{--#{BOUNDARY}}
37
- m << %{Content-Disposition: form-data; name="#{i[:name]}"\r\n\r\n#{v}}
38
- end
39
- form_data << %{--#{BOUNDARY}--\r\n}
40
- req = Net::HTTP::Post.new form.at("//form")[:action]
41
- set_headers req
42
- req["Content-Type"] = "multipart/form-data, boundary=#{BOUNDARY}"
43
- req.body = form_data.join("\r\n")
44
- resp = @http.request req
45
- page_resp = get_page URI.parse(resp["location"]).request_uri
46
- doc = Nokogiri::HTML page_resp.body
47
- @array = doc.css("div.viewHover").map do |item|
48
- ref = item.at("div/div/a")&.text&.sub "\u00A0", ""
49
- title = item.at("div/div/span")&.text
50
- keyword = item.at("div[3]/div[5]/span")&.text
51
- date_pub = item.at("//label[.='Publication Date: ']/following-sibling::span")&.text
52
- en = item.at("//span[.='ENGLISH']/../..")
53
- date_rel = en.at("./following-sibling::span[contains(@id, 'cfRelDateE')]").text
54
- link = en.xpath("//a[contains(@title, 'Open')]").map do |l|
55
- { content: l[:href], type: l[:title].match(/PDF|Word/).to_s.downcase }
56
- end
57
- Hit.new({
58
- ref: ref,
59
- title: title,
60
- keyword: keyword,
61
- date_pub: date_pub,
62
- date_rel: date_rel,
63
- link: link
64
- }, self)
65
- end
23
+ doc = Nokogiri::HTML page_resp(form_resp, text).body
24
+ @array = doc.css("div.viewHover").map { |item| hit item }
66
25
  end
67
26
  end
68
27
 
@@ -84,9 +43,125 @@ module RelatonUn
84
43
  get_page request_uri, deep + 1
85
44
  end
86
45
 
46
+ # rubocop:disable Metrics/MethodLength
47
+
48
+ # @param form [Nokogiri::HTML::Document]
49
+ # @param text [String]
50
+ # @return [Array<String>]
51
+ def form_data(form, text)
52
+ fd = form.xpath(
53
+ "//input[@type!='radio']",
54
+ "//input[@type='radio'][@checked]",
55
+ "//select[@name!='view:_id1:_id2:cbLang']",
56
+ "//textarea",
57
+ ).reduce([]) do |m, i|
58
+ v = case i[:name]
59
+ when "view:_id1:_id2:txtSymbol" then text
60
+ when "view:_id1:_id2:cbType" then "FP"
61
+ when "view:_id1:_id2:cbSort" then "R"
62
+ when "$$xspsubmitid" then "view:_id1:_id2:_id130"
63
+ when "$$xspsubmitscroll" then "0|167"
64
+ else i[:value]
65
+ end
66
+ m << %{--#{BOUNDARY}}
67
+ m << %{Content-Disposition: form-data; name="#{i[:name]}"\r\n\r\n#{v}}
68
+ end
69
+ fd << %{--#{BOUNDARY}--\r\n}
70
+ end
71
+ # rubocop:enable Metrics/MethodLength
72
+
73
+ # @param form_resp [Net::HTTPOK]
74
+ # @param text [String]
75
+ # @return [Net::HTTPOK]
76
+ def page_resp(form_resp, text)
77
+ form = Nokogiri::HTML form_resp.body
78
+ req = Net::HTTP::Post.new form.at("//form")[:action]
79
+ set_headers req
80
+ req["Content-Type"] = "multipart/form-data, boundary=#{BOUNDARY}"
81
+ req.body = form_data(form, text).join("\r\n")
82
+ resp = @http.request req
83
+ get_page URI.parse(resp["location"]).request_uri
84
+ end
85
+
86
+ # @param item [Nokogiri::XML::Element]
87
+ # @return [RelatonUn::Hit]
88
+ def hit(item)
89
+ Hit.new(hit_data(item), self)
90
+ end
91
+
92
+ # @param item [Nokogiri::XML::Element]
93
+ # @return [Hash]
94
+ def hit_data(item)
95
+ en = item.at("//span[.='ENGLISH']/../..")
96
+ {
97
+ ref: item.at("div/div/a")&.text&.sub("\u00A0", ""),
98
+ symbol: symbol(item),
99
+ title: item.at("div/div/span")&.text,
100
+ keyword: item.at("div[3]/div[5]/span")&.text,
101
+ date_pub: date_pub(item),
102
+ date_rel: date_rel(en),
103
+ link: link(en),
104
+ session: session(item),
105
+ agenda: agenda(item),
106
+ distribution: distribution(item)
107
+ }
108
+ end
109
+
110
+ # @param item [Nokogiri::XML::Element]
111
+ # @return [Array<String>]
112
+ def symbol(item)
113
+ item.xpath("div/div[not(contains(@class, 'hidden'))]/"\
114
+ "label[contains(.,'Symbol')]/following-sibling::span[1]").map &:text
115
+ end
116
+
117
+ # @param item [Nokogiri::XML::Element]
118
+ # @return [String, NilClass]
119
+ def date_pub(item)
120
+ item.at("//label[.='Publication Date: ']/following-sibling::span")&.text
121
+ end
122
+
123
+ # @param item [Nokogiri::XML::Element]
124
+ # @return [String, NilClass]
125
+ def date_rel(item)
126
+ item.at("./following-sibling::span[contains(@id, 'cfRelDateE')]")&.text
127
+ end
128
+
129
+ # @param item [Nokogiri::XML::Element]
130
+ # @return [Array<Hash>]
131
+ def link(item)
132
+ item.xpath("//a[contains(@title, 'Open')]").map do |l|
133
+ {
134
+ content: l[:href],
135
+ type: l[:title].match(/PDF|Word/).to_s.downcase,
136
+ }
137
+ end
138
+ end
139
+
140
+ # @param item [Nokogiri::XML::Element]
141
+ # @return [String, NilClass]
142
+ def session(item)
143
+ item.at("//label[.='Session / Year:']/following-sibling::span")&.text
144
+ end
145
+
146
+ # @param item [Nokogiri::XML::Element]
147
+ # @return [String, NilClass]
148
+ def agenda(item)
149
+ item.at("//label[.='Agenda Item(s):']/following-sibling::span")&.text
150
+ end
151
+
152
+ # @param item [Nokogiri::XML::Element]
153
+ # @return [String, NilClass]
154
+ def distribution(item)
155
+ item.at("//label[.='Distribution:']/following-sibling::span")&.text
156
+ end
157
+
158
+ # rubocop:disable Metrics/MethodLength
159
+
160
+ # @param req [Net::HTTP::Get, Net::HTTP::Post]
87
161
  def set_headers(req)
88
162
  set_cookie req
89
- req["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
163
+ req["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,"\
164
+ "image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
90
165
  req["Accept-Encoding"] = "gzip, deflate, br"
91
166
  req["Cache-Control"] = "max-age=0"
92
167
  req["Connection"] = "keep-alive"
@@ -98,7 +173,9 @@ module RelatonUn
98
173
  req["Upgrade-Insecure-Requests"] = "1"
99
174
  req["User-Agent"] = AGENT
100
175
  end
176
+ # rubocop:enable Metrics/MethodLength
101
177
 
178
+ # @param req [Net::HTTP::Get, Net::HTTP::Post]
102
179
  def set_cookie(req)
103
180
  req["Cookie"] = HTTP::Cookie.cookie_value @jar.cookies(@uri)
104
181
  end