bipm-data-importer 0.1.3 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8fcde8a899380a032040ac0f44f380edda5b0f0a6c6591dc874a9e4f90504544
4
- data.tar.gz: ba4cce57619f4356eb66309d75347fa24db76ba67936fb0e0cc161b97745c13e
3
+ metadata.gz: 3bf2e9209301b13261c1b15013611cd17cf5355d6152c1f59d131aa486bbec6c
4
+ data.tar.gz: d1bf3ab0059a945f134068cf3c9cab045fdb6c9970edd3dc0cb2ce839c3dce9b
5
5
  SHA512:
6
- metadata.gz: db266f37ca351605df8e13ea5c75fae2fe9b9ddbd8038e4392233e2e103d28e1a33c11a7f1bf397c93a8e5f32afdb293045341a776fd09f16b4cae5a054ac340
7
- data.tar.gz: 46bec77135211aec51a90a03f21dbc72f9cc1443108b965be211449f89b963d90f87a58e354a5a33ec9a750fa0c0ea6357382f4ecbf1f76434c54bd3b3818a0f
6
+ metadata.gz: f653f802d49a88b5e9f0f5b6380a1c9a0fdd73e52be093cae3bc62020d9057221dc9b9264a75bf10198b154f16dd8010480056ae624f7a1c49e7659e1b87b6fc
7
+ data.tar.gz: dd797598112540212aa451e62089b4dde8dc89e4cbfd85e83e6c653e002e65fbab8783ca4cfaab6d8c10a4442e90cacbb6c057b02615c270bf6b7dc3f531a3e0
@@ -30,7 +30,7 @@ Gem::Specification.new do |spec|
30
30
 
31
31
  spec.add_dependency "nokogiri"
32
32
  spec.add_dependency "mechanize"
33
- spec.add_dependency "reverse_adoc"
33
+ spec.add_dependency "coradoc"
34
34
  spec.add_dependency "pry"
35
35
 
36
36
  spec.add_dependency "vcr"
data/exe/bipm-fetch CHANGED
@@ -15,7 +15,8 @@ bodies = {
15
15
  "CCL": 'https://www.bipm.org/en/committees/cc/ccl',
16
16
  "CCEM": 'https://www.bipm.org/en/committees/cc/ccem',
17
17
  "CCAUV": 'https://www.bipm.org/en/committees/cc/ccauv',
18
- "CIPM": 'https://www.bipm.org/en/committees/ci/cipm'
18
+ "CIPM": 'https://www.bipm.org/en/committees/ci/cipm',
19
+ "CGPM": 'https://www.bipm.org/en/committees/cg/cgpm',
19
20
  }
20
21
 
21
22
  BASE_DIR = "data"
@@ -64,6 +65,7 @@ bodies.each do |bodyid, bodyurl|
64
65
 
65
66
  title = meeting_div.at_css('.meetings-list__informations-title').text.strip
66
67
  href = meeting_div.at_css('.meetings-list__informations-title').attr('href')
68
+ href = "/#{meeting_lang}" + href unless href.start_with? "/#{meeting_lang}/"
67
69
 
68
70
  ident = href.split("/#{body}/").last.gsub('/', '.')
69
71
  yr = href.include?("/wg/") ? nil : href.split('-').last
@@ -124,6 +126,7 @@ bodies.each do |bodyid, bodyurl|
124
126
 
125
127
  h["resolutions"] = resolutions.map do |href|
126
128
  href = href.gsub('/web/guest/', "/#{meeting_lang}/")
129
+ href = href.sub("www.bipm.org/", "www.bipm.org/#{meeting_lang}/") unless href.include? "/#{meeting_lang}/"
127
130
 
128
131
  # error: https://www.bipm.org/fr/committees/ci/cipm/104-_1-2015 has wrong references to Recommandations
129
132
  href = href.gsub('/104-2015/', '/104-_1-2015/')
@@ -343,7 +346,7 @@ bodies.each do |bodyid, bodyurl|
343
346
  "documents" => i.css(".publications__content").map do |d|
344
347
  {
345
348
  "title" => d.at_css(".title-third").text.strip.gsub(/\s+/, ' '),
346
- "pdf" => d.at_css(".title-third").attr("href").split('?').first,
349
+ "pdf" => d.at_css(".title-third")&.attr("href")&.split('?')&.first,
347
350
  # "description" => d.css('.publications__body')[0]&.text&.strip,
348
351
  # "author" => d.css('.publications__body')[1]&.text&.strip,
349
352
  }.compact
data/exe/bipm-fetch-cgpm CHANGED
@@ -1,71 +1,3 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require_relative '../lib/bipm-data-importer'
4
-
5
- BASE_DIR = "data"
6
- a = Mechanize.new
7
-
8
- meetings_en = VCR.use_cassette 'cgpm-meetings' do
9
- a.get "https://www.bipm.org/en/committees/cg/cgpm/cgpm-resolutions"
10
- end
11
-
12
- meetings_fr = VCR.use_cassette 'cgpm-meetings-fr' do
13
- a.get "https://www.bipm.org/fr/committees/cg/cgpm/cgpm-resolutions"
14
- end
15
-
16
- FileUtils.rm_rf "#{BASE_DIR}/cgpm/meetings"
17
- FileUtils.rm_rf "#{BASE_DIR}/cgpm/meetings-fr"
18
- FileUtils.rm_rf "#{BASE_DIR}/cgpm/meetings-en"
19
-
20
- [['en', meetings_en], ['fr', meetings_fr]].each do |meeting_lang, meetings|
21
- urls = meetings.css('div.publications__content').map do |option|
22
- url = option.at_css('a').attr('href')
23
- url = url.gsub('/web/guest/', "/#{meeting_lang}/")
24
- url.split('/').first(8).join('/')
25
- end.uniq
26
-
27
- urls.each do |url|
28
- meeting_id = url.split('/').last.to_i
29
- meeting_lang_sfx = (meeting_lang == 'fr') ? "-fr" : ""
30
- meeting_lang_sfx_dir = (meeting_lang == 'fr') ? "-fr" : "-en"
31
- meeting = VCR.use_cassette("cgpm-meeting-#{meeting_id}#{meeting_lang_sfx}") { a.get url }
32
-
33
- title = meeting.at_css('h1.session__title, .journal-content-article h1').text.strip
34
- date = Bipm::Data::Importer::Common.extract_date(meeting.at_css('p.session__date, .journal-content-article h2').text)
35
-
36
- pdf = Bipm::Data::Importer::Common.extract_pdf(meeting, meeting_lang)
37
-
38
- h = {
39
- "metadata" => {
40
- "title" => title,
41
- "identifier" => meeting_id,
42
- "date" => date.to_s,
43
- "source" => "BIPM - Pavillon de Breteuil",
44
- "url" => meeting.uri.to_s
45
- }
46
- }
47
-
48
- h["pdf"] = pdf if pdf
49
-
50
- resolutions = meeting.links_with(href: %r</resolution->).map(&:href)
51
-
52
- # A mistake on a website, resolution 5 listed 4 times...
53
- # https://www.bipm.org/fr/committees/cg/cgpm/8-1933
54
- if meeting_id == 8 && meeting_lang == 'fr' && resolutions.sort.uniq != resolutions.sort
55
- resolutions = (1..15).map do |i|
56
- "https://www.bipm.org/en/committees/cg/cgpm/8-1933/resolution-#{i}"
57
- end
58
- end
59
-
60
- h["resolutions"] = resolutions.map do |res_link|
61
- res_id = (res_link.split('-')[2] || 0).to_i
62
- res_link = res_link.gsub('/web/guest/', "/#{meeting_lang}/")
63
- res = VCR.use_cassette("cgpm-resolution-#{meeting_id}-#{res_id}#{meeting_lang_sfx}") { a.get res_link }
64
-
65
- Bipm::Data::Importer::Common.parse_resolution(res, res_id, date, :cgpm, meeting_lang, "resolution?")
66
- end
67
-
68
- FileUtils.mkdir_p("#{BASE_DIR}/cgpm/meetings#{meeting_lang_sfx_dir}")
69
- File.write("#{BASE_DIR}/cgpm/meetings#{meeting_lang_sfx_dir}/meeting-#{"%02d" % meeting_id}.yml", YAML.dump(h))
70
- end
71
- end
3
+ warn "DEPRECATED: This tool is no longer needed. Please call `bipm-fetch` only."
@@ -1,20 +1,19 @@
1
- require 'mechanize'
2
- require 'reverse_adoc'
3
- require 'vcr'
4
- require 'date'
5
- require 'fileutils'
6
- require 'pry'
7
- require_relative 'asciimath'
1
+ require "mechanize"
2
+ require "coradoc/input/html"
3
+ require "vcr"
4
+ require "date"
5
+ require "fileutils"
6
+ require "pry"
7
+ require_relative "asciimath"
8
8
 
9
9
  VCR.configure do |c|
10
- c.cassette_library_dir = __dir__+'/../../../../cassettes'
10
+ c.cassette_library_dir = __dir__ + "/../../../../cassettes"
11
11
  c.hook_into :webmock
12
12
  end
13
13
 
14
14
  module Bipm
15
15
  module Data
16
16
  module Importer
17
-
18
17
  CONSIDERATIONS = {
19
18
  /(?:having(?: regard)?|ayant|concerne|vu la|agissant conformément|sachant|de porter)/i => "having / having regard",
20
19
  /(?:noting|to note|took note|note[sd]?|taking note|takes note|constatant|constate|that|notant|notant que|note également|(?:prend|prenant) (?:acte|note))/i => "noting",
@@ -50,7 +49,7 @@ module Bipm
50
49
  /(?:Appendix 1 of the|L'Annexe 1 de la|increased|a (?:examiné|préparé)|transmettra|fournira|increased|developed a document|prendra contact)/i => "declares",
51
50
  /(?:Le Temps Atomique International |International Atomic Time \(TAI\) |will meet )/i => "declares",
52
51
  /(?:ask[s ]|asked|souhaite|souhaiterait)/i => "asks",
53
- /(?:(?:further )?invit(?:[ée][ds]?|era)|renouvelle en conséquence|convient d'inviter)/i => "invites / further invites",
52
+ /(?:(?:further |et )?invit(?:[ée][ds]?|era)|renouvelle en conséquence|convient d'inviter)/i => "invites / further invites",
54
53
  /(?:resolve[sd]?)/i => "resolves",
55
54
  /(?:confirms|confirmed?|confirme que|committed|s'engageant)/i => "confirms",
56
55
  /(?:welcom(?:e[sd]?|ing)|accueille favorablement(?:les)?|salu(?:e|ant))/i => "welcomes",
@@ -79,50 +78,52 @@ module Bipm
79
78
  /(?:empowers|habilite)/i => "empowers",
80
79
  }
81
80
 
82
- PREFIX1=/(?:The|Le) CIPM |La Conférence |M. Volterra |M. le Président |unanimously |would |a |sont |will |were |did not |strongly |(?:La|The) (?:\d+(?:e|th)|Quinzième) Conférence Générale des Poids et Mesures(?: a |,\s+)?/i
83
- PREFIX2=/The \d+th Conférence Générale des Poids et Mesures |The Conference |and |et (?:en |)|has |renouvelle sa |renews its |further |and further |En ce qui |après avoir |\.\.\.\n+\t*/i
84
- PREFIX3=/Sur la proposition de M. le Président, la convocation de cette Conférence de Thermométrie est |Le texte corrigé, finalement |(?:The|Le) Comité International(?: des Poids et Mesures)?(?: \(CIPM\))?(?: a |,)?\s*/i
85
- PREFIX4=/(?:The |Le |)(?:JCRB|JCGM|CCU|CCTF|CCT|CCRI|CCPR|CCQM|CCM|CCL|CCEM|CCAUV|KCDB),? (?:also |)|Each RMO |fully |The JCRB Rules of Procedure are |Bob Watters and Claudine Thomas /
86
- PREFIX5=/(?:The |Le |All |)(?:incoming |)(?:JCRB |KCDB |)(?:documents|(?:Consultative |)Committees?|Office|Chairman(?: and Secretary|)|Joint BIPM[\/-]ILAC Working Group(?: \(see Action 22\))|RMO(?:[- ]JCRB|) Representatives(?: to the JRCB|)|(?:BIPM |)Director(?: of BIPM|)|SIM|(?:Exec(?:utive|) |)Secretary(?:\(ies\)|)|RMOs, except SIM,|RMOs|APMP|\(?(?:[MD]r|Prof) [A-Z][a-zR-]+\)?|CMCs|EUR[AO]MET|COOMET|GULFMET) |It was /
87
- PREFIX6=/“|"|« à |All RMO documents related to review procedures |Mr Lam and Dr Kühne |The Prof. Kühne, Mr Jones and the Executive Secretary |Ajchara Charoensook, from APMP, /
81
+ PREFIX1 = /(?:The|Le) CIPM |La Conférence |M. Volterra |M. le Président |unanimously |would |a |sont |will |were |did not |strongly |(?:La|The) (?:\d+(?:e|th)|Quinzième) Conférence Générale des Poids et Mesures(?: a |,\s+)?/i
82
+ PREFIX2 = /The \d+th Conférence Générale des Poids et Mesures |The Conference |and |et (?:en |)|has |renouvelle sa |renews its |further |and further |En ce qui |après avoir |\.\.\.\n+\t*/i
83
+ PREFIX3 = /Sur la proposition de M. le Président, la convocation de cette Conférence de Thermométrie est |Le texte corrigé, finalement |(?:The|Le) Comité International(?: des Poids et Mesures)?(?: \(CIPM\))?(?: a |,)?\s*/i
84
+ PREFIX4 = /(?:The |Le |)(?:JCRB|JCGM|CCU|CCTF|CCT|CCRI|CCPR|CCQM|CCM|CCL|CCEM|CCAUV|KCDB),? (?:also |)|Each RMO |fully |The JCRB Rules of Procedure are |Bob Watters and Claudine Thomas /
85
+ PREFIX5 = /(?:The |Le |All |)(?:incoming |)(?:JCRB |KCDB |)(?:documents|(?:Consultative |)Committees?|Office|Chairman(?: and Secretary|)|Joint BIPM[\/-]ILAC Working Group(?: \(see Action 22\))|RMO(?:[- ]JCRB|) Representatives(?: to the JRCB|)|(?:BIPM |)Director(?: of BIPM|)|SIM|(?:Exec(?:utive|) |)Secretary(?:\(ies\)|)|RMOs, except SIM,|RMOs|APMP|\(?(?:[MD]r|Prof) [A-Z][a-zR-]+\)?|CMCs|EUR[AO]MET|COOMET|GULFMET) |It was /
86
+ PREFIX6 = /“|"|« à |All RMO documents related to review procedures |Mr Lam and Dr Kühne |The Prof. Kühne, Mr Jones and the Executive Secretary |Ajchara Charoensook, from APMP, /
87
+
88
+ PREFIX = /(?:#{PREFIX1}|#{PREFIX2}|#{PREFIX3}|#{PREFIX4}|#{PREFIX5}|#{PREFIX6})?/i
88
89
 
89
- PREFIX=/(?:#{PREFIX1}|#{PREFIX2}|#{PREFIX3}|#{PREFIX4}|#{PREFIX5}|#{PREFIX6})?/i
90
+ SUFFIX = / (?:that|que)\b|(?: (?:the |that |le |que les )?((?:[A-Z]|national|laboratoires).{0,80}?)(?: to)?\b|)/
90
91
 
91
- SUFFIX=/ (?:that|que)\b|(?: (?:the |that |le |que les )?((?:[A-Z]|national|laboratoires).{0,80}?)(?: to)?\b|)/
92
+ DOIREGEX = %r'\s+<p>\s+<b>DOI :</b> (.*?)\s+</p>\n\n'
92
93
 
93
94
  module Common
94
- def replace_links ps, res, lang
95
- ps.css('a[href]').each do |a|
96
- href = a.attr('href')
95
+ def replace_links(ps, res, lang)
96
+ ps.css("a[href]").each do |a|
97
+ href = a.attr("href")
97
98
 
98
- href = href.gsub(%r'\Ahttps://www.bipm.org/', '')
99
+ href = href.gsub(%r'\Ahttps://www\.bipm\.org/', "")
99
100
 
100
101
  # Correct links
101
- href = href.gsub('/web/guest/', "/#{lang}/")
102
+ href = href.gsub("/web/guest/", "/#{lang}/")
102
103
 
103
104
  # Account for some mistakes from an upstream document
104
- href = href.gsub(%r"\A/jen/", '/en/')
105
- href = href.gsub(%r"\A/en/CGPM/jsp/", '/en/CGPM/db/')
105
+ href = href.gsub(%r"\A/jen/", "/en/")
106
+ href = href.gsub(%r"\A/en/CGPM/jsp/", "/en/CGPM/db/")
106
107
 
107
108
  href = case href
108
- when %r'\A/(\w{2})/CGPM/db/(\d+)/(\d+)/(#.*)?\z',
109
- %r'\A/jsp/(\w{2})/ViewCGPMResolution\.jsp\?CGPM=(\d+)&RES=(\d+)(#.*)?\z',
110
- %r'\A/(\w{2})/committees/cg/cgpm/(\d+)-\d+/resolution-(\d+)(#.*)?\z',
111
- "cgpm-resolution:#{$1}/#{$2}/#{$3}#{$4}"
112
- when %r'\A/(\w{2})/CIPM/db/(\d+)/(\d+)/(#.*)?\z'
113
- "cipm-resolution:#{$1}/#{$2}/#{$3}#{$4}"
114
- when %r'\A/(\w{2})/committees/cipm/meeting/([0-9()I]+).html(#.*)?\z'
115
- "cipm-decisions:#{$1}/#{$2}#{$3}"
116
- else
117
- URI(res.uri).merge(href).to_s # Relative -> absolute
118
- end
109
+ when %r'\A/(\w{2})/CGPM/db/(\d+)/(\d+)/(#.*)?\z',
110
+ %r'\A/jsp/(\w{2})/ViewCGPMResolution\.jsp\?CGPM=(\d+)&RES=(\d+)(#.*)?\z',
111
+ %r'\A/(\w{2})/committees/cg/cgpm/(\d+)-\d+/resolution-(\d+)(#.*)?\z',
112
+ "cgpm-resolution:#{$1}/#{$2}/#{$3}#{$4}"
113
+ when %r'\A/(\w{2})/CIPM/db/(\d+)/(\d+)/(#.*)?\z'
114
+ "cipm-resolution:#{$1}/#{$2}/#{$3}#{$4}"
115
+ when %r'\A/(\w{2})/committees/cipm/meeting/([0-9()I]+).html(#.*)?\z'
116
+ "cipm-decisions:#{$1}/#{$2}#{$3}"
117
+ else
118
+ URI(res.uri).merge(href).to_s # Relative -> absolute
119
+ end
119
120
 
120
- a.set_attribute('href', href)
121
+ a.set_attribute("href", href)
121
122
  end
122
123
  end
123
124
 
124
- def replace_centers ps
125
- centers = ps.css('center').to_a
125
+ def replace_centers(ps)
126
+ centers = ps.css("center").to_a
126
127
  while centers.length > 0
127
128
  center = centers.first
128
129
  current = center
@@ -131,7 +132,7 @@ module Bipm
131
132
  break unless current.next
132
133
  while Nokogiri::XML::Text === current.next
133
134
  current = current.next
134
- break if current.text.strip != ''
135
+ break if current.text.strip != ""
135
136
  end
136
137
  break unless current.next
137
138
  break unless current.next.name == "center"
@@ -157,47 +158,52 @@ module Bipm
157
158
  end
158
159
 
159
160
  # Remove the remaining centers
160
- ps.css('center').each do |i|
161
+ ps.css("center").each do |i|
161
162
  i.replace i.inner_html
162
163
  end
163
164
  end
164
165
 
165
- def format_message part
166
+ def format_message(part)
166
167
  AsciiMath.asciidoc_extract_math(
167
- ReverseAdoc.convert(part).strip.gsub("&nbsp;", ' ').gsub(" \n", "\n")
168
+ Coradoc::Input::HTML.convert(part).strip.gsub("&nbsp;", " ").gsub(" \n", "\n")
168
169
  )
170
+ rescue
171
+ warn "Bug in Coradoc, couldn't parse the following document:"
172
+ pp part
173
+ warn "Please report this as an issue to https://github.com/metanorma/coradoc"
174
+ raise
169
175
  end
170
176
 
171
- def ng_to_string ps
172
- ps.inner_html.encode('utf-8').gsub("\r", '').gsub(%r'</?nobr>','')
177
+ def ng_to_string(ps)
178
+ ps.inner_html.encode("utf-8").gsub("\r", "").gsub(%r'</?nobr>', "")
173
179
  end
174
180
 
175
- def parse_resolution res, res_id, date, type = :cgpm, lang = 'en', rec_type = nil
181
+ def parse_resolution(res, res_id, date, type = :cgpm, lang = "en", rec_type = nil)
176
182
  # Reparse the document after fixing upstream syntax
177
183
  fixed_body = res.body.gsub("<name=", "<a name=")
178
- fixed_body = fixed_body.force_encoding('utf-8')
179
- fixed_body = fixed_body.gsub('&Eacute;', 'É')
180
- fixed_body = fixed_body.gsub('&#171;&#032;', '« ')
181
- fixed_body = fixed_body.gsub('&#032;&#187;', ' »')
182
- fixed_body = fixed_body.sub(%r'<h1>(.*?)</h1>'m, '')
184
+ fixed_body = fixed_body.force_encoding("utf-8")
185
+ fixed_body = fixed_body.gsub("&Eacute;", "É")
186
+ fixed_body = fixed_body.gsub("&#171;&#032;", "« ")
187
+ fixed_body = fixed_body.gsub("&#032;&#187;", " »")
188
+ fixed_body = fixed_body.sub(%r'<h1>(.*?)</h1>'m, "")
183
189
  supertitle = $1.strip
184
- fixed_body = fixed_body.sub(%r'<h2>(.*?)</h2>'m, '')
190
+ fixed_body = fixed_body.sub(%r'<h2>(.*?)</h2>'m, "")
185
191
  title = $1.strip
186
192
  fixed_body = fixed_body.sub(/(="web-content">)\s*<p>\s*(<p)/, '\1\2')
187
193
  fixed_body = fixed_body.gsub(%r"<a name=\"haut\">(.*?)</a>"m, '\1')
188
194
  ng = Nokogiri::HTML(fixed_body, res.uri.to_s, "utf-8", Nokogiri::XML::ParseOptions.new.default_html.noent)
189
195
 
190
- refs = ng.css('.publication-card_reference a')
196
+ refs = ng.css(".publication-card_reference a")
191
197
 
192
198
  if rec_type.end_with? "?"
193
199
  rec_type = case supertitle
194
- when /\AD[eé]claration/
195
- "declaration"
196
- when /\AR[eé]solution/
197
- "resolution"
198
- else
199
- rec_type[..-2]
200
- end
200
+ when /\AD[eé]claration/
201
+ "statement"
202
+ when /\AR[eé]solution/
203
+ "resolution"
204
+ else
205
+ rec_type[..-2]
206
+ end
201
207
  end
202
208
 
203
209
  r = {
@@ -220,7 +226,7 @@ module Bipm
220
226
  r.delete("type") unless r["type"]
221
227
 
222
228
  if refs.length > 0
223
- r["reference"] = res.uri.merge(refs.first.attr('href')).to_s.split('?').first
229
+ r["reference"] = res.uri.merge(refs.first.attr("href")).to_s.split("?").first
224
230
  name, page = refs.first.text.strip.split(/, p(?=[0-9])/)
225
231
  r["reference_name"] = name
226
232
  if page
@@ -234,7 +240,7 @@ module Bipm
234
240
  r.delete("reference_page")
235
241
  end
236
242
 
237
- ps = ng.css('div.journal-content-article').first
243
+ ps = ng.css("div.journal-content-article").first
238
244
 
239
245
  #binding.pry if ps.count != 1
240
246
 
@@ -247,6 +253,11 @@ module Bipm
247
253
  doc = Common.ng_to_string(ps)
248
254
  # doc = AsciiMath.html_to_asciimath(doc)
249
255
 
256
+ if doc.match? DOIREGEX
257
+ doc = doc.sub(DOIREGEX, "")
258
+ r["doi"] = $1
259
+ end
260
+
250
261
  parts = doc.split(/(\n(?:<p>)?<b>.*?<\/b>|\n<p><i>.*?<\/i>|<div class="bipm-lame-grey">|<h3>|<p>(?:après examen |après avoir entendu )|having noted that |decides to define |décide de définir |conformément à l'invitation|acting in accordance with|recommande que les résultats|(?:strongly |and further |)(?:considers|recommends|recommande) (?:la|that|que(?! « ))|estime que|declares<\/p>|déclare :<\/b><\/p>|<a name="_ftn\d)/)
251
262
  nparts = [parts.shift]
252
263
  while parts.length > 0
@@ -267,13 +278,13 @@ module Bipm
267
278
  next
268
279
  end
269
280
 
270
- if parse.start_with? 'NOTE'
271
- part = part.sub('<h3>NOTE</h3>', '')
281
+ if parse.start_with? "NOTE"
282
+ part = part.sub("<h3>NOTE</h3>", "")
272
283
  r["notes"] = Common.format_message(part)
273
284
  next
274
285
  end
275
286
 
276
- CONSIDERATIONS.any? do |k,v|
287
+ CONSIDERATIONS.any? do |k, v|
277
288
  if parse =~ /\A#{PREFIX}#{k}\b/i
278
289
  r["considerations"] << prev = {
279
290
  "type" => v,
@@ -283,7 +294,7 @@ module Bipm
283
294
  end
284
295
  end && next
285
296
 
286
- ACTIONS.any? do |k,v|
297
+ ACTIONS.any? do |k, v|
287
298
  if parse =~ /\A#{PREFIX}#{k}\b/i
288
299
  r["actions"] << prev = {
289
300
  "type" => v,
@@ -314,13 +325,13 @@ module Bipm
314
325
  end
315
326
 
316
327
  %w[considerations actions].each do |type|
317
- map = type == 'actions' ? ACTIONS : CONSIDERATIONS
328
+ map = type == "actions" ? ACTIONS : CONSIDERATIONS
318
329
  r[type] = r[type].map do |i|
319
330
  islist = false
320
331
 
321
332
  kk = nil
322
333
 
323
- if map.any? { |k,v| (i["message"].split("\n").first =~ /\A\s*([*_]?)(#{PREFIX}#{k})\1?(#{SUFFIX})\1?\s*\z/i) && (kk = k) }
334
+ if map.any? { |k, v| (i["message"].split("\n").first =~ /\A\s*([*_]?)(#{PREFIX}#{k})\1?(#{SUFFIX})\1?\s*\z/i) && (kk = k) }
324
335
  prefix = $2
325
336
  suffix = $3
326
337
  subject = $4
@@ -347,15 +358,15 @@ module Bipm
347
358
 
348
359
  if subject
349
360
  #p subject
350
- r['subject'] ||= []
351
- r['subject'] << subject
361
+ r["subject"] ||= []
362
+ r["subject"] << subject
352
363
  end
353
364
 
354
365
  if islist
355
366
  suffix = suffix.strip
356
- suffix = nil if suffix == ''
367
+ suffix = nil if suffix == ""
357
368
  listitems.map do |li|
358
- i.merge 'message' => [prefix, suffix, li].compact.join(" ")
369
+ i.merge "message" => [prefix, suffix, li].compact.join(" ")
359
370
  end
360
371
  else
361
372
  i
@@ -363,13 +374,13 @@ module Bipm
363
374
  end.flatten
364
375
  end
365
376
 
366
- if r['subject']
367
- r['subject'] = r['subject'].uniq.join(" and ")
377
+ if r["subject"]
378
+ r["subject"] = r["subject"].uniq.join(" and ")
368
379
  end
369
380
 
370
381
  # Note: we replace the previously set r['subject'].
371
- r['subject'] = type.to_s.upcase.gsub("-", ' ')
372
- r['subject'] = 'CCDS' if type == :cctf && supertitle.include?("CCDS")
382
+ r["subject"] = type.to_s.upcase.gsub("-", " ")
383
+ r["subject"] = "CCDS" if type == :cctf && supertitle.include?("CCDS")
373
384
 
374
385
  r
375
386
  end
@@ -377,11 +388,11 @@ module Bipm
377
388
  def extract_pdf(meeting, lang)
378
389
  pdfs = meeting.css('a.title-third[href*=".pdf"]')
379
390
  .map { |i| i.attr("href") }
380
- .map { |i| i.split('?').first }
391
+ .map { |i| i.split("?").first }
381
392
  .select do |i|
382
- i.downcase.include?("-#{lang}.pdf") ||
383
- %w[en fr].none? { |l| i.downcase.include? "-#{l}.pdf" }
384
- end
393
+ i.downcase.include?("-#{lang}.pdf") ||
394
+ %w[en fr].none? { |l| i.downcase.include? "-#{l}.pdf" }
395
+ end
385
396
 
386
397
  pdfs = pdfs.first if pdfs.length <= 1
387
398
 
@@ -389,8 +400,10 @@ module Bipm
389
400
  end
390
401
 
391
402
  def extract_date(date_str)
403
+ return nil unless date_str
404
+
392
405
  date = date_str.strip
393
- .gsub(/\s+/, ' ')
406
+ .gsub(/\s+/, " ")
394
407
  .gsub("février", "february") # 3 first letters must match English
395
408
  .gsub("juin", "june")
396
409
  .gsub("avril", "april")
@@ -411,7 +424,6 @@ module Bipm
411
424
 
412
425
  extend self
413
426
  end
414
-
415
427
  end
416
428
  end
417
429
  end
@@ -3,7 +3,7 @@
3
3
  module Bipm
4
4
  module Data
5
5
  module Importer
6
- VERSION = "0.1.3"
6
+ VERSION = "0.2.1"
7
7
  end
8
8
  end
9
9
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bipm-data-importer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-01-06 00:00:00.000000000 Z
11
+ date: 2024-12-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -39,7 +39,7 @@ dependencies:
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
41
  - !ruby/object:Gem::Dependency
42
- name: reverse_adoc
42
+ name: coradoc
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
45
  - - ">="
@@ -206,7 +206,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
206
206
  - !ruby/object:Gem::Version
207
207
  version: '0'
208
208
  requirements: []
209
- rubygems_version: 3.3.26
209
+ rubygems_version: 3.3.27
210
210
  signing_key:
211
211
  specification_version: 4
212
212
  summary: Importer for BIPM CGPM and CIPM content