relaton-un 0.2.0 → 1.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -107,7 +107,9 @@
107
107
  <zeroOrMore>
108
108
  <ref name="submissionlanguage"/>
109
109
  </zeroOrMore>
110
- <ref name="editorialgroup"/>
110
+ <optional>
111
+ <ref name="editorialgroup"/>
112
+ </optional>
111
113
  <zeroOrMore>
112
114
  <ref name="ics"/>
113
115
  </zeroOrMore>
@@ -117,6 +119,9 @@
117
119
  <optional>
118
120
  <ref name="session"/>
119
121
  </optional>
122
+ <optional>
123
+ <ref name="job_number"/>
124
+ </optional>
120
125
  </define>
121
126
  <define name="preface">
122
127
  <element name="preface">
@@ -136,6 +141,109 @@
136
141
  <ref name="Basic-Section"/>
137
142
  </element>
138
143
  </define>
144
+ <define name="Clause-Section">
145
+ <optional>
146
+ <attribute name="id">
147
+ <data type="ID"/>
148
+ </attribute>
149
+ </optional>
150
+ <optional>
151
+ <attribute name="language"/>
152
+ </optional>
153
+ <optional>
154
+ <attribute name="script"/>
155
+ </optional>
156
+ <optional>
157
+ <attribute name="inline-header">
158
+ <data type="boolean"/>
159
+ </attribute>
160
+ </optional>
161
+ <optional>
162
+ <attribute name="obligation">
163
+ <choice>
164
+ <value>normative</value>
165
+ <value>informative</value>
166
+ </choice>
167
+ </attribute>
168
+ </optional>
169
+ <optional>
170
+ <attribute name="unnumbered">
171
+ <data type="boolean"/>
172
+ </attribute>
173
+ </optional>
174
+ <optional>
175
+ <ref name="section-title"/>
176
+ </optional>
177
+ <group>
178
+ <group>
179
+ <zeroOrMore>
180
+ <ref name="BasicBlock"/>
181
+ </zeroOrMore>
182
+ <zeroOrMore>
183
+ <ref name="note"/>
184
+ </zeroOrMore>
185
+ </group>
186
+ <zeroOrMore>
187
+ <choice>
188
+ <ref name="clause-subsection"/>
189
+ <ref name="terms"/>
190
+ <ref name="definitions"/>
191
+ </choice>
192
+ </zeroOrMore>
193
+ </group>
194
+ </define>
195
+ <define name="Annex-Section">
196
+ <optional>
197
+ <attribute name="id">
198
+ <data type="ID"/>
199
+ </attribute>
200
+ </optional>
201
+ <optional>
202
+ <attribute name="language"/>
203
+ </optional>
204
+ <optional>
205
+ <attribute name="script"/>
206
+ </optional>
207
+ <optional>
208
+ <attribute name="inline-header">
209
+ <data type="boolean"/>
210
+ </attribute>
211
+ </optional>
212
+ <optional>
213
+ <attribute name="obligation">
214
+ <choice>
215
+ <value>normative</value>
216
+ <value>informative</value>
217
+ </choice>
218
+ </attribute>
219
+ </optional>
220
+ <optional>
221
+ <attribute name="unnumbered">
222
+ <data type="boolean"/>
223
+ </attribute>
224
+ </optional>
225
+ <optional>
226
+ <ref name="section-title"/>
227
+ </optional>
228
+ <group>
229
+ <group>
230
+ <zeroOrMore>
231
+ <ref name="BasicBlock"/>
232
+ </zeroOrMore>
233
+ <zeroOrMore>
234
+ <ref name="note"/>
235
+ </zeroOrMore>
236
+ </group>
237
+ <zeroOrMore>
238
+ <choice>
239
+ <ref name="annex-subsection"/>
240
+ <ref name="terms"/>
241
+ <ref name="definitions"/>
242
+ <ref name="references"/>
243
+ </choice>
244
+ </zeroOrMore>
245
+ </group>
246
+ </define>
139
247
  </include>
140
248
  <define name="session">
141
249
  <element name="session">
@@ -221,9 +329,15 @@
221
329
  <value>general</value>
222
330
  <value>limited</value>
223
331
  <value>restricted</value>
332
+ <value>provisional</value>
224
333
  </choice>
225
334
  </element>
226
335
  </define>
336
+ <define name="job_number">
337
+ <element name="job_number">
338
+ <text/>
339
+ </element>
340
+ </define>
227
341
  <define name="un-standard">
228
342
  <element name="un-standard">
229
343
  <ref name="bibdata"/>
@@ -6,6 +6,8 @@ require "relaton_un/hit_collection"
6
6
  require "relaton_un/hit"
7
7
  require "relaton_un/hash_converter"
8
8
  require "relaton_un/xml_parser"
9
+ require "relaton_un/session"
10
+ require "relaton_un/editorialgroup"
9
11
 
10
12
  module RelatonUn
11
13
  class Error < StandardError; end
@@ -0,0 +1,25 @@
1
module RelatonUn
  # Editorial group of a UN document, held as a list of committee names.
  class EditorialGroup
    include RelatonBib

    # @return [Array<String>] committee names
    attr_reader :committee

    # @param committee [Array<String>] committee names
    def initialize(committee)
      @committee = committee
    end

    # Emits an `editorialgroup` element through the builder, with one
    # `committee` child per entry of {#committee}.
    #
    # @param builder [Nokogiri::XML::Builder]
    def to_xml(builder)
      builder.editorialgroup do |group|
        committee.each do |name|
          group.committee name
        end
      end
    end

    # Maps every committee to a `{ "committee" => name }` hash and hands the
    # list to `single_element_array`.
    #
    # @return [Array<Hash>, Hash]
    def to_hash
      entries = committee.map { |name| { "committee" => name } }
      single_element_array(entries)
    end
  end
end
@@ -1,5 +1,33 @@
1
1
  module RelatonUn
2
2
  class HashConverter < RelatonBib::HashConverter
3
+ class << self
4
+ # @override RelatonIsoBib::HashConverter.hash_to_bib
5
+ # @param args [Hash]
6
+ # @param nested [TrueClass, FalseClass]
7
+ # @return [Hash]
8
+ def hash_to_bib(args, nested = false)
9
+ ret = super
10
+ return if ret.nil?
3
11
 
12
+ session_hash_to_bib ret
13
+ ret
14
+ end
15
+
16
+ private
17
+
18
+ # @param ret [Hash]
19
+ def session_hash_to_bib(ret)
20
+ ret[:session] = Session.new(ret[:session]) if ret[:session]
21
+ end
22
+
23
+ # @param ret [Hash]
24
+ def editorialgroup_hash_to_bib(ret)
25
+ eg = ret[:editorialgroup]
26
+ return unless eg
27
+
28
+ committee = eg.map { |e| e[:committee] }
29
+ ret[:editorialgroup] = EditorialGroup.new array(committee)
30
+ end
31
+ end
4
32
  end
5
33
  end
@@ -3,6 +3,51 @@
3
3
  module RelatonUn
4
4
  # Hit.
5
5
  class Hit < RelatonBib::Hit
6
+ # rubocop:disable Layout/LineLength
7
+
8
+ # There is distribution PRO (A/47/PV.102/CORR.1, A/47/PV.54)
9
+ BODY = {
10
+ "A" => "General Assembly",
11
+ "E" => "Economic and Social Council",
12
+ "S" => "Security Council",
13
+ "T" => "Trusteeship Council",
14
+ "ACC" => "Administrative Committee on Coordination",
15
+ "AT" => "United Nations Administrative Tribunal",
16
+ "CAT" => "Committee against Torture",
17
+ "CCPR" => "Human Rights Committee",
18
+ "CD" => "Conference on Disarmament",
19
+ "CEDAW" => "Committee on the Elimination of All Forms of Discrimination against Women",
20
+ "CERD" => "Committee on the Elimination of Racial Discrimination",
21
+ "CRC" => "Committee on the Rights of the Child",
22
+ "DC" => "Disarmament Commission",
23
+ "DP" => "United Nations Development Programme",
24
+ "HS" => "United Nations Centre for Human Settlements (HABITAT)",
25
+ "TD" => "United Nations Conference on Trade and Development",
26
+ "UNEP" => "United Nations Environment Programme",
27
+ "TRADE" => "Committee on Trade",
28
+ "CEFACT" => "Centre for Trade Facilitation and Electronic Business",
29
+ "C.1" => "Disarmament and International Security Committee",
30
+ "C.2" => "Economic and Financial Committee",
31
+ "C.3" => "Social, Humanitarian & Cultural Issues",
32
+ "C.4" => "Special Political and Decolonization Committee",
33
+ "C.5" => "Administrative and Budgetary Committee",
34
+ "C.6" => "Sixth Committee (Legal)",
35
+ "PC" => "Preparatory Committee",
36
+ "AEC" => "Atomic Energy Commission",
37
+ "AGRI" => "Committee on Agriculture",
38
+ "AMCEN" => "African Ministerial Conference on the Environment",
39
+ "AMCOW" => "African Ministers’ Council on Water",
40
+ "ECA" => "Economic Commission for Africa",
41
+ "ESCAP" => "Economic and Social Commission for Asia and Pacific",
42
+ "ECE" => "Economic Commission for Europe",
43
+ "ECWA" => "Economic Commission for Western Asia",
44
+ "UNFF" => "United Nations Forum on Forests",
45
+ "ENERGY" => "Committee on Sustainable Energy",
46
+ "FAO" => "Food and Agriculture Organization",
47
+ "UNCTAD" => "United Nations Conference on Trade and Development",
48
+ }.freeze
49
+ # rubocop:enable Layout/LineLength
50
+
6
51
  # Parse page.
7
52
  # @return [RelatonUn::UnBibliographicItem]
8
53
  def fetch
@@ -11,48 +56,97 @@ module RelatonUn
11
56
 
12
57
  private
13
58
 
59
# rubocop:disable Metrics/MethodLength

# Builds the bibliographic item for this hit. Every value other than the
# fixed type, language and script comes from the hit data via the
# fetch_* helpers; the fetch date is today's date.
#
# @return [RelatonUn::UnBibliographicItem]
def un_bib_item
  UnBibliographicItem.new(
    type: "standard",
    fetched: Date.today.to_s,
    docid: fetch_docid,
    docnumber: hit[:ref],
    language: ["en"],
    script: ["Latn"],
    title: fetch_title,
    date: fetch_date,
    link: fetch_link,
    keyword: fetch_keyword,
    session: fetch_session,
    distribution: fetch_distribution,
    editorialgroup: fetch_editorialgroup,
    classification: fetch_classification,
  )
end
# rubocop:enable Metrics/MethodLength
28
81
 
29
82
  # @return [Array<RelatonBib::DocumentIdentifier>]
30
- def docid
31
- [RelatonBib::DocumentIdentifier.new(id: hit[:ref], type: "UN")]
83
+ def fetch_docid
84
+ hit[:symbol].map do |s|
85
+ RelatonBib::DocumentIdentifier.new(id: s, type: "UN")
86
+ end
32
87
  end
33
88
 
34
89
  # @return [Array<RelatonBib::TypedTitleString>]
35
- def title
36
- fs = RelatonBib::FormattedString.new(content: hit[:title], language: "en", script: "Latn")
37
- [RelatonBib::TypedTitleString.new(type: "main", title: fs)]
90
+ def fetch_title
91
+ # fs = RelatonBib::FormattedString.new(
92
+ # content: hit[:title], language: "en", script: "Latn",
93
+ # )
94
+ # [RelatonBib::TypedTitleString.new(type: "main", title: fs)]
95
+ # [{ title_main: hit[:title], language: "en", script: "Latn" }]
96
+ RelatonBib::TypedTitleString.from_string hit[:title], "en", "Latn"
38
97
  end
39
98
 
40
99
  # @return [Array<RelatonBib::BibliographicDate>]
41
- def date
100
+ def fetch_date
42
101
  d = []
43
- d << RelatonBib::BibliographicDate.new(type: "published", on: hit[:date_pub]) if hit[:date_pub]
44
- d << RelatonBib::BibliographicDate.new(type: "issued", on: hit[:date_rel]) if hit[:date_rel]
102
+ d << bibdate("published", hit[:date_pub]) if hit[:date_pub]
103
+ d << bibdate("issued", hit[:date_rel]) if hit[:date_rel]
45
104
  d
46
105
  end
47
106
 
107
# Shorthand constructor for a bibliographic date.
#
# @param type [String] date type
# @param on [String] date value
# @return [RelatonBib::BibliographicDate]
def bibdate(type, on)
  RelatonBib::BibliographicDate.new(type: type, on: on)
end
113
+
48
114
  # @return [Array<RelatonBib::TypedUri>]
49
- def link
115
+ def fetch_link
50
116
  hit[:link].map { |l| RelatonBib::TypedUri.new l }
51
117
  end
52
118
 
53
119
  # @return [Array<String>]
54
- def keyword
120
+ def fetch_keyword
55
121
  hit[:keyword].split(", ")
56
122
  end
123
+
124
# Builds the session from the hit's session and agenda values.
#
# @return [RelatonUn::Session]
def fetch_session
  Session.new(
    session_number: hit[:session],
    agenda_id: hit[:agenda],
  )
end
128
+
129
# Looks up the hit's distribution value in
# UnBibliographicItem::DISTRIBUTIONS.
#
# @return [String]
def fetch_distribution
  key = hit[:distribution]
  UnBibliographicItem::DISTRIBUTIONS[key]
end
133
+
134
# Derives the editorial group from the document reference.
#
# The leading run of non-whitespace characters of the reference is split
# on "/" and "-". A part that is a key of BODY contributes the mapped
# body name; a part matching the committee pattern contributes itself;
# every other part is ignored. Duplicates are dropped.
#
# @return [RelatonUn::EditorialGroup, NilClass] nil when the reference is
#   missing or yields no committee
def fetch_editorialgroup
  # hit[:ref] is nil when the result page has no reference link.
  symbol = hit[:ref].to_s[/^[\S]+/].to_s
  committees = symbol.split(/\/|-/).each_with_object([]) do |part, found|
    if BODY[part] then found << BODY[part]
    elsif part.match?(/(AC|C|CN|CONF|GC|SC|Sub|WG).\d+|PC/) then found << part
    end
  end.uniq
  return if committees.empty?

  RelatonUn::EditorialGroup.new committees
end
146
+
147
# Every hit gets the same single classification: type "area", value "UNDOC".
#
# @return [Array<RelatonBib::Classification>]
def fetch_classification
  undoc = RelatonBib::Classification.new(type: "area", value: "UNDOC")
  [undoc]
end
57
151
  end
58
152
  end
@@ -6,7 +6,8 @@ require "http-cookie"
6
6
  module RelatonUn
7
7
  # Page of hit collection.
8
8
  class HitCollection < RelatonBib::HitCollection
9
- AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"
9
+ AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) "\
10
+ "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"
10
11
  DOMAIN = "https://documents.un.org"
11
12
  BOUNDARY = "----WebKitFormBoundary6hkaBvITDck8dHCn"
12
13
 
@@ -17,52 +18,10 @@ module RelatonUn
17
18
  @jar = HTTP::CookieJar.new
18
19
  @http = Net::HTTP.new @uri.host, @uri.port
19
20
  @http.use_ssl = true
21
+ @http.read_timeout = 120
20
22
  if (form_resp = get_page)
21
- form = Nokogiri::HTML form_resp.body
22
- form_data = form.xpath(
23
- "//input[@type!='radio']",
24
- "//input[@type='radio'][@checked]",
25
- "//select[@name!='view:_id1:_id2:cbLang']",
26
- "//textarea"
27
- ).reduce([]) do |m, i|
28
- v = case i[:name]
29
- when "view:_id1:_id2:txtSymbol" then text
30
- when "view:_id1:_id2:cbType" then "FP"
31
- when "view:_id1:_id2:cbSort" then "R"
32
- when "$$xspsubmitid" then "view:_id1:_id2:_id130"
33
- when "$$xspsubmitscroll" then "0|167"
34
- else i[:value]
35
- end
36
- m << %{--#{BOUNDARY}}
37
- m << %{Content-Disposition: form-data; name="#{i[:name]}"\r\n\r\n#{v}}
38
- end
39
- form_data << %{--#{BOUNDARY}--\r\n}
40
- req = Net::HTTP::Post.new form.at("//form")[:action]
41
- set_headers req
42
- req["Content-Type"] = "multipart/form-data, boundary=#{BOUNDARY}"
43
- req.body = form_data.join("\r\n")
44
- resp = @http.request req
45
- page_resp = get_page URI.parse(resp["location"]).request_uri
46
- doc = Nokogiri::HTML page_resp.body
47
- @array = doc.css("div.viewHover").map do |item|
48
- ref = item.at("div/div/a")&.text&.sub "\u00A0", ""
49
- title = item.at("div/div/span")&.text
50
- keyword = item.at("div[3]/div[5]/span")&.text
51
- date_pub = item.at("//label[.='Publication Date: ']/following-sibling::span")&.text
52
- en = item.at("//span[.='ENGLISH']/../..")
53
- date_rel = en.at("./following-sibling::span[contains(@id, 'cfRelDateE')]").text
54
- link = en.xpath("//a[contains(@title, 'Open')]").map do |l|
55
- { content: l[:href], type: l[:title].match(/PDF|Word/).to_s.downcase }
56
- end
57
- Hit.new({
58
- ref: ref,
59
- title: title,
60
- keyword: keyword,
61
- date_pub: date_pub,
62
- date_rel: date_rel,
63
- link: link
64
- }, self)
65
- end
23
+ doc = Nokogiri::HTML page_resp(form_resp, text).body
24
+ @array = doc.css("div.viewHover").map { |item| hit item }
66
25
  end
67
26
  end
68
27
 
@@ -84,9 +43,125 @@ module RelatonUn
84
43
  get_page request_uri, deep + 1
85
44
  end
86
45
 
46
# rubocop:disable Metrics/MethodLength

# Builds the parts of the multipart search request from the fields of the
# search form. Five fields get fixed values (the symbol field receives
# the search text); every other field keeps the value found in the form.
# The closing boundary is appended as the last part.
#
# @param form [Nokogiri::HTML::Document]
# @param text [String]
# @return [Array<String>]
def form_data(form, text)
  fields = form.xpath(
    "//input[@type!='radio']",
    "//input[@type='radio'][@checked]",
    "//select[@name!='view:_id1:_id2:cbLang']",
    "//textarea",
  )
  forced = {
    "view:_id1:_id2:txtSymbol" => text,
    "view:_id1:_id2:cbType" => "FP",
    "view:_id1:_id2:cbSort" => "R",
    "$$xspsubmitid" => "view:_id1:_id2:_id130",
    "$$xspsubmitscroll" => "0|167",
  }
  parts = fields.flat_map do |field|
    name = field[:name]
    value = forced.key?(name) ? forced[name] : field[:value]
    [
      %{--#{BOUNDARY}},
      %{Content-Disposition: form-data; name="#{name}"\r\n\r\n#{value}},
    ]
  end
  parts << %{--#{BOUNDARY}--\r\n}
end
# rubocop:enable Metrics/MethodLength
72
+
73
# Submits the search form and fetches the page named by the "location"
# header of the response.
#
# NOTE(review): the Content-Type value separates the boundary parameter
# with a comma; media-type parameters are normally separated by ";".
# Left unchanged - confirm what the server expects.
#
# @param form_resp [Net::HTTPOK] response holding the search form
# @param text [String] search text
# @return [Net::HTTPOK]
def page_resp(form_resp, text)
  form = Nokogiri::HTML(form_resp.body)
  action = form.at("//form")[:action]
  post = Net::HTTP::Post.new(action)
  set_headers post
  post["Content-Type"] = "multipart/form-data, boundary=#{BOUNDARY}"
  post.body = form_data(form, text).join("\r\n")
  target = @http.request(post)["location"]
  get_page URI.parse(target).request_uri
end
85
+
86
# Wraps the data extracted from one result element into a Hit that
# belongs to this collection.
#
# @param item [Nokogiri::XML::Element]
# @return [RelatonUn::Hit]
def hit(item)
  data = hit_data(item)
  Hit.new(data, self)
end
91
+
92
# Extracts the raw fields of one search result.
#
# NOTE(review): the query for the "ENGLISH" span starts with "//", which
# presumably searches the whole document rather than only `item` -
# confirm this is intended when a page lists several results.
#
# @param item [Nokogiri::XML::Element]
# @return [Hash]
def hit_data(item)
  english = item.at("//span[.='ENGLISH']/../..")
  {
    ref: item.at("div/div/a")&.text&.sub("\u00A0", ""),
    symbol: symbol(item),
    title: item.at("div/div/span")&.text,
    keyword: item.at("div[3]/div[5]/span")&.text,
    date_pub: date_pub(item),
    date_rel: date_rel(english),
    link: link(english),
    session: session(item),
    agenda: agenda(item),
    distribution: distribution(item),
  }
end
109
+
110
# Collects the text of the span that follows each "Symbol" label located
# in a non-hidden div of the result element.
#
# @param item [Nokogiri::XML::Element]
# @return [Array<String>] one entry per matching label
def symbol(item)
  query = "div/div[not(contains(@class, 'hidden'))]/"\
    "label[contains(.,'Symbol')]/following-sibling::span[1]"
  item.xpath(query).map(&:text)
end
116
+
117
# Reads the publication date shown for one result.
#
# The query is relative (".//") so that only labels inside `item` are
# considered; a query starting with "//" would search the whole document
# and return the first result's date for every item.
#
# @param item [Nokogiri::XML::Element]
# @return [String, nil] nil when the result has no publication date label
def date_pub(item)
  item.at(".//label[.='Publication Date: ']/following-sibling::span")&.text
end
122
+
123
# Reads the text of the following-sibling span whose id contains
# "cfRelDateE".
#
# @param item [Nokogiri::XML::Element]
# @return [String]
def date_rel(item)
  span = item.at("./following-sibling::span[contains(@id, 'cfRelDateE')]")
  span&.text
end
128
+
129
# Collects the anchors whose title contains "Open" as link hashes. The
# type is "pdf" or "word" depending on the anchor title, or "" when the
# title names neither.
#
# NOTE(review): the query starts with "//", which presumably matches
# anchors anywhere in the document rather than only under `item` -
# confirm this is intended.
#
# @param item [Nokogiri::XML::Element]
# @return [Array<Hash>]
def link(item)
  anchors = item.xpath("//a[contains(@title, 'Open')]")
  anchors.map do |anchor|
    kind = anchor[:title].match(/PDF|Word/).to_s.downcase
    { content: anchor[:href], type: kind }
  end
end
139
+
140
# Reads the "Session / Year" value shown for one result.
#
# The query is relative (".//") so that only labels inside `item` are
# considered; a query starting with "//" would search the whole document
# and return the first result's value for every item.
#
# @param item [Nokogiri::XML::Element]
# @return [String, nil] nil when the result has no such label
def session(item)
  item.at(".//label[.='Session / Year:']/following-sibling::span")&.text
end
145
+
146
# Reads the "Agenda Item(s)" value shown for one result.
#
# The query is relative (".//") so that only labels inside `item` are
# considered; a query starting with "//" would search the whole document
# and return the first result's value for every item.
#
# @param item [Nokogiri::XML::Element]
# @return [String, nil] nil when the result has no such label
def agenda(item)
  item.at(".//label[.='Agenda Item(s):']/following-sibling::span")&.text
end
151
+
152
# Reads the "Distribution" value shown for one result.
#
# The query is relative (".//") so that only labels inside `item` are
# considered; a query starting with "//" would search the whole document
# and return the first result's value for every item.
#
# @param item [Nokogiri::XML::Element]
# @return [String, nil] nil when the result has no such label
def distribution(item)
  item.at(".//label[.='Distribution:']/following-sibling::span")&.text
end
157
+
158
+ # rubocop:disable Metrics/MethodLength
159
+
160
+ # @param req [Net::HTTP::Get, Net::HTTP::Post]
87
161
  def set_headers(req)
88
162
  set_cookie req
89
- req["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
163
+ req["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,"\
164
+ "image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
90
165
  req["Accept-Encoding"] = "gzip, deflate, br"
91
166
  req["Cache-Control"] = "max-age=0"
92
167
  req["Connection"] = "keep-alive"
@@ -98,7 +173,9 @@ module RelatonUn
98
173
  req["Upgrade-Insecure-Requests"] = "1"
99
174
  req["User-Agent"] = AGENT
100
175
  end
176
+ # rubocop:enable Metrics/MethodLength
101
177
 
178
# Sets the request's Cookie header from the cookies the jar holds for
# the collection's URI.
#
# @param req [Net::HTTP::Get, Net::HTTP::Post]
def set_cookie(req)
  cookies = @jar.cookies(@uri)
  req["Cookie"] = HTTP::Cookie.cookie_value(cookies)
end