bipm-data-importer 0.1.3 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8fcde8a899380a032040ac0f44f380edda5b0f0a6c6591dc874a9e4f90504544
4
- data.tar.gz: ba4cce57619f4356eb66309d75347fa24db76ba67936fb0e0cc161b97745c13e
3
+ metadata.gz: 3bf2e9209301b13261c1b15013611cd17cf5355d6152c1f59d131aa486bbec6c
4
+ data.tar.gz: d1bf3ab0059a945f134068cf3c9cab045fdb6c9970edd3dc0cb2ce839c3dce9b
5
5
  SHA512:
6
- metadata.gz: db266f37ca351605df8e13ea5c75fae2fe9b9ddbd8038e4392233e2e103d28e1a33c11a7f1bf397c93a8e5f32afdb293045341a776fd09f16b4cae5a054ac340
7
- data.tar.gz: 46bec77135211aec51a90a03f21dbc72f9cc1443108b965be211449f89b963d90f87a58e354a5a33ec9a750fa0c0ea6357382f4ecbf1f76434c54bd3b3818a0f
6
+ metadata.gz: f653f802d49a88b5e9f0f5b6380a1c9a0fdd73e52be093cae3bc62020d9057221dc9b9264a75bf10198b154f16dd8010480056ae624f7a1c49e7659e1b87b6fc
7
+ data.tar.gz: dd797598112540212aa451e62089b4dde8dc89e4cbfd85e83e6c653e002e65fbab8783ca4cfaab6d8c10a4442e90cacbb6c057b02615c270bf6b7dc3f531a3e0
@@ -30,7 +30,7 @@ Gem::Specification.new do |spec|
30
30
 
31
31
  spec.add_dependency "nokogiri"
32
32
  spec.add_dependency "mechanize"
33
- spec.add_dependency "reverse_adoc"
33
+ spec.add_dependency "coradoc"
34
34
  spec.add_dependency "pry"
35
35
 
36
36
  spec.add_dependency "vcr"
data/exe/bipm-fetch CHANGED
@@ -15,7 +15,8 @@ bodies = {
15
15
  "CCL": 'https://www.bipm.org/en/committees/cc/ccl',
16
16
  "CCEM": 'https://www.bipm.org/en/committees/cc/ccem',
17
17
  "CCAUV": 'https://www.bipm.org/en/committees/cc/ccauv',
18
- "CIPM": 'https://www.bipm.org/en/committees/ci/cipm'
18
+ "CIPM": 'https://www.bipm.org/en/committees/ci/cipm',
19
+ "CGPM": 'https://www.bipm.org/en/committees/cg/cgpm',
19
20
  }
20
21
 
21
22
  BASE_DIR = "data"
@@ -64,6 +65,7 @@ bodies.each do |bodyid, bodyurl|
64
65
 
65
66
  title = meeting_div.at_css('.meetings-list__informations-title').text.strip
66
67
  href = meeting_div.at_css('.meetings-list__informations-title').attr('href')
68
+ href = "/#{meeting_lang}" + href unless href.start_with? "/#{meeting_lang}/"
67
69
 
68
70
  ident = href.split("/#{body}/").last.gsub('/', '.')
69
71
  yr = href.include?("/wg/") ? nil : href.split('-').last
@@ -124,6 +126,7 @@ bodies.each do |bodyid, bodyurl|
124
126
 
125
127
  h["resolutions"] = resolutions.map do |href|
126
128
  href = href.gsub('/web/guest/', "/#{meeting_lang}/")
129
+ href = href.sub("www.bipm.org/", "www.bipm.org/#{meeting_lang}/") unless href.include? "/#{meeting_lang}/"
127
130
 
128
131
  # error: https://www.bipm.org/fr/committees/ci/cipm/104-_1-2015 has wrong references to Recommandations
129
132
  href = href.gsub('/104-2015/', '/104-_1-2015/')
@@ -343,7 +346,7 @@ bodies.each do |bodyid, bodyurl|
343
346
  "documents" => i.css(".publications__content").map do |d|
344
347
  {
345
348
  "title" => d.at_css(".title-third").text.strip.gsub(/\s+/, ' '),
346
- "pdf" => d.at_css(".title-third").attr("href").split('?').first,
349
+ "pdf" => d.at_css(".title-third")&.attr("href")&.split('?')&.first,
347
350
  # "description" => d.css('.publications__body')[0]&.text&.strip,
348
351
  # "author" => d.css('.publications__body')[1]&.text&.strip,
349
352
  }.compact
data/exe/bipm-fetch-cgpm CHANGED
@@ -1,71 +1,3 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require_relative '../lib/bipm-data-importer'
4
-
5
- BASE_DIR = "data"
6
- a = Mechanize.new
7
-
8
- meetings_en = VCR.use_cassette 'cgpm-meetings' do
9
- a.get "https://www.bipm.org/en/committees/cg/cgpm/cgpm-resolutions"
10
- end
11
-
12
- meetings_fr = VCR.use_cassette 'cgpm-meetings-fr' do
13
- a.get "https://www.bipm.org/fr/committees/cg/cgpm/cgpm-resolutions"
14
- end
15
-
16
- FileUtils.rm_rf "#{BASE_DIR}/cgpm/meetings"
17
- FileUtils.rm_rf "#{BASE_DIR}/cgpm/meetings-fr"
18
- FileUtils.rm_rf "#{BASE_DIR}/cgpm/meetings-en"
19
-
20
- [['en', meetings_en], ['fr', meetings_fr]].each do |meeting_lang, meetings|
21
- urls = meetings.css('div.publications__content').map do |option|
22
- url = option.at_css('a').attr('href')
23
- url = url.gsub('/web/guest/', "/#{meeting_lang}/")
24
- url.split('/').first(8).join('/')
25
- end.uniq
26
-
27
- urls.each do |url|
28
- meeting_id = url.split('/').last.to_i
29
- meeting_lang_sfx = (meeting_lang == 'fr') ? "-fr" : ""
30
- meeting_lang_sfx_dir = (meeting_lang == 'fr') ? "-fr" : "-en"
31
- meeting = VCR.use_cassette("cgpm-meeting-#{meeting_id}#{meeting_lang_sfx}") { a.get url }
32
-
33
- title = meeting.at_css('h1.session__title, .journal-content-article h1').text.strip
34
- date = Bipm::Data::Importer::Common.extract_date(meeting.at_css('p.session__date, .journal-content-article h2').text)
35
-
36
- pdf = Bipm::Data::Importer::Common.extract_pdf(meeting, meeting_lang)
37
-
38
- h = {
39
- "metadata" => {
40
- "title" => title,
41
- "identifier" => meeting_id,
42
- "date" => date.to_s,
43
- "source" => "BIPM - Pavillon de Breteuil",
44
- "url" => meeting.uri.to_s
45
- }
46
- }
47
-
48
- h["pdf"] = pdf if pdf
49
-
50
- resolutions = meeting.links_with(href: %r</resolution->).map(&:href)
51
-
52
- # A mistake on a website, resolution 5 listed 4 times...
53
- # https://www.bipm.org/fr/committees/cg/cgpm/8-1933
54
- if meeting_id == 8 && meeting_lang == 'fr' && resolutions.sort.uniq != resolutions.sort
55
- resolutions = (1..15).map do |i|
56
- "https://www.bipm.org/en/committees/cg/cgpm/8-1933/resolution-#{i}"
57
- end
58
- end
59
-
60
- h["resolutions"] = resolutions.map do |res_link|
61
- res_id = (res_link.split('-')[2] || 0).to_i
62
- res_link = res_link.gsub('/web/guest/', "/#{meeting_lang}/")
63
- res = VCR.use_cassette("cgpm-resolution-#{meeting_id}-#{res_id}#{meeting_lang_sfx}") { a.get res_link }
64
-
65
- Bipm::Data::Importer::Common.parse_resolution(res, res_id, date, :cgpm, meeting_lang, "resolution?")
66
- end
67
-
68
- FileUtils.mkdir_p("#{BASE_DIR}/cgpm/meetings#{meeting_lang_sfx_dir}")
69
- File.write("#{BASE_DIR}/cgpm/meetings#{meeting_lang_sfx_dir}/meeting-#{"%02d" % meeting_id}.yml", YAML.dump(h))
70
- end
71
- end
3
+ warn "DEPRECATED: This tool is no longer needed. Please call `bipm-fetch` only."
@@ -1,20 +1,19 @@
1
- require 'mechanize'
2
- require 'reverse_adoc'
3
- require 'vcr'
4
- require 'date'
5
- require 'fileutils'
6
- require 'pry'
7
- require_relative 'asciimath'
1
+ require "mechanize"
2
+ require "coradoc/input/html"
3
+ require "vcr"
4
+ require "date"
5
+ require "fileutils"
6
+ require "pry"
7
+ require_relative "asciimath"
8
8
 
9
9
  VCR.configure do |c|
10
- c.cassette_library_dir = __dir__+'/../../../../cassettes'
10
+ c.cassette_library_dir = __dir__ + "/../../../../cassettes"
11
11
  c.hook_into :webmock
12
12
  end
13
13
 
14
14
  module Bipm
15
15
  module Data
16
16
  module Importer
17
-
18
17
  CONSIDERATIONS = {
19
18
  /(?:having(?: regard)?|ayant|concerne|vu la|agissant conformément|sachant|de porter)/i => "having / having regard",
20
19
  /(?:noting|to note|took note|note[sd]?|taking note|takes note|constatant|constate|that|notant|notant que|note également|(?:prend|prenant) (?:acte|note))/i => "noting",
@@ -50,7 +49,7 @@ module Bipm
50
49
  /(?:Appendix 1 of the|L'Annexe 1 de la|increased|a (?:examiné|préparé)|transmettra|fournira|increased|developed a document|prendra contact)/i => "declares",
51
50
  /(?:Le Temps Atomique International |International Atomic Time \(TAI\) |will meet )/i => "declares",
52
51
  /(?:ask[s ]|asked|souhaite|souhaiterait)/i => "asks",
53
- /(?:(?:further )?invit(?:[ée][ds]?|era)|renouvelle en conséquence|convient d'inviter)/i => "invites / further invites",
52
+ /(?:(?:further |et )?invit(?:[ée][ds]?|era)|renouvelle en conséquence|convient d'inviter)/i => "invites / further invites",
54
53
  /(?:resolve[sd]?)/i => "resolves",
55
54
  /(?:confirms|confirmed?|confirme que|committed|s'engageant)/i => "confirms",
56
55
  /(?:welcom(?:e[sd]?|ing)|accueille favorablement(?:les)?|salu(?:e|ant))/i => "welcomes",
@@ -79,50 +78,52 @@ module Bipm
79
78
  /(?:empowers|habilite)/i => "empowers",
80
79
  }
81
80
 
82
- PREFIX1=/(?:The|Le) CIPM |La Conférence |M. Volterra |M. le Président |unanimously |would |a |sont |will |were |did not |strongly |(?:La|The) (?:\d+(?:e|th)|Quinzième) Conférence Générale des Poids et Mesures(?: a |,\s+)?/i
83
- PREFIX2=/The \d+th Conférence Générale des Poids et Mesures |The Conference |and |et (?:en |)|has |renouvelle sa |renews its |further |and further |En ce qui |après avoir |\.\.\.\n+\t*/i
84
- PREFIX3=/Sur la proposition de M. le Président, la convocation de cette Conférence de Thermométrie est |Le texte corrigé, finalement |(?:The|Le) Comité International(?: des Poids et Mesures)?(?: \(CIPM\))?(?: a |,)?\s*/i
85
- PREFIX4=/(?:The |Le |)(?:JCRB|JCGM|CCU|CCTF|CCT|CCRI|CCPR|CCQM|CCM|CCL|CCEM|CCAUV|KCDB),? (?:also |)|Each RMO |fully |The JCRB Rules of Procedure are |Bob Watters and Claudine Thomas /
86
- PREFIX5=/(?:The |Le |All |)(?:incoming |)(?:JCRB |KCDB |)(?:documents|(?:Consultative |)Committees?|Office|Chairman(?: and Secretary|)|Joint BIPM[\/-]ILAC Working Group(?: \(see Action 22\))|RMO(?:[- ]JCRB|) Representatives(?: to the JRCB|)|(?:BIPM |)Director(?: of BIPM|)|SIM|(?:Exec(?:utive|) |)Secretary(?:\(ies\)|)|RMOs, except SIM,|RMOs|APMP|\(?(?:[MD]r|Prof) [A-Z][a-zR-]+\)?|CMCs|EUR[AO]MET|COOMET|GULFMET) |It was /
87
- PREFIX6=/“|"|« à |All RMO documents related to review procedures |Mr Lam and Dr Kühne |The Prof. Kühne, Mr Jones and the Executive Secretary |Ajchara Charoensook, from APMP, /
81
+ PREFIX1 = /(?:The|Le) CIPM |La Conférence |M. Volterra |M. le Président |unanimously |would |a |sont |will |were |did not |strongly |(?:La|The) (?:\d+(?:e|th)|Quinzième) Conférence Générale des Poids et Mesures(?: a |,\s+)?/i
82
+ PREFIX2 = /The \d+th Conférence Générale des Poids et Mesures |The Conference |and |et (?:en |)|has |renouvelle sa |renews its |further |and further |En ce qui |après avoir |\.\.\.\n+\t*/i
83
+ PREFIX3 = /Sur la proposition de M. le Président, la convocation de cette Conférence de Thermométrie est |Le texte corrigé, finalement |(?:The|Le) Comité International(?: des Poids et Mesures)?(?: \(CIPM\))?(?: a |,)?\s*/i
84
+ PREFIX4 = /(?:The |Le |)(?:JCRB|JCGM|CCU|CCTF|CCT|CCRI|CCPR|CCQM|CCM|CCL|CCEM|CCAUV|KCDB),? (?:also |)|Each RMO |fully |The JCRB Rules of Procedure are |Bob Watters and Claudine Thomas /
85
+ PREFIX5 = /(?:The |Le |All |)(?:incoming |)(?:JCRB |KCDB |)(?:documents|(?:Consultative |)Committees?|Office|Chairman(?: and Secretary|)|Joint BIPM[\/-]ILAC Working Group(?: \(see Action 22\))|RMO(?:[- ]JCRB|) Representatives(?: to the JRCB|)|(?:BIPM |)Director(?: of BIPM|)|SIM|(?:Exec(?:utive|) |)Secretary(?:\(ies\)|)|RMOs, except SIM,|RMOs|APMP|\(?(?:[MD]r|Prof) [A-Z][a-zR-]+\)?|CMCs|EUR[AO]MET|COOMET|GULFMET) |It was /
86
+ PREFIX6 = /“|"|« à |All RMO documents related to review procedures |Mr Lam and Dr Kühne |The Prof. Kühne, Mr Jones and the Executive Secretary |Ajchara Charoensook, from APMP, /
87
+
88
+ PREFIX = /(?:#{PREFIX1}|#{PREFIX2}|#{PREFIX3}|#{PREFIX4}|#{PREFIX5}|#{PREFIX6})?/i
88
89
 
89
- PREFIX=/(?:#{PREFIX1}|#{PREFIX2}|#{PREFIX3}|#{PREFIX4}|#{PREFIX5}|#{PREFIX6})?/i
90
+ SUFFIX = / (?:that|que)\b|(?: (?:the |that |le |que les )?((?:[A-Z]|national|laboratoires).{0,80}?)(?: to)?\b|)/
90
91
 
91
- SUFFIX=/ (?:that|que)\b|(?: (?:the |that |le |que les )?((?:[A-Z]|national|laboratoires).{0,80}?)(?: to)?\b|)/
92
+ DOIREGEX = %r'\s+<p>\s+<b>DOI :</b> (.*?)\s+</p>\n\n'
92
93
 
93
94
  module Common
94
- def replace_links ps, res, lang
95
- ps.css('a[href]').each do |a|
96
- href = a.attr('href')
95
+ def replace_links(ps, res, lang)
96
+ ps.css("a[href]").each do |a|
97
+ href = a.attr("href")
97
98
 
98
- href = href.gsub(%r'\Ahttps://www.bipm.org/', '')
99
+ href = href.gsub(%r'\Ahttps://www\.bipm\.org/', "")
99
100
 
100
101
  # Correct links
101
- href = href.gsub('/web/guest/', "/#{lang}/")
102
+ href = href.gsub("/web/guest/", "/#{lang}/")
102
103
 
103
104
  # Account for some mistakes from an upstream document
104
- href = href.gsub(%r"\A/jen/", '/en/')
105
- href = href.gsub(%r"\A/en/CGPM/jsp/", '/en/CGPM/db/')
105
+ href = href.gsub(%r"\A/jen/", "/en/")
106
+ href = href.gsub(%r"\A/en/CGPM/jsp/", "/en/CGPM/db/")
106
107
 
107
108
  href = case href
108
- when %r'\A/(\w{2})/CGPM/db/(\d+)/(\d+)/(#.*)?\z',
109
- %r'\A/jsp/(\w{2})/ViewCGPMResolution\.jsp\?CGPM=(\d+)&RES=(\d+)(#.*)?\z',
110
- %r'\A/(\w{2})/committees/cg/cgpm/(\d+)-\d+/resolution-(\d+)(#.*)?\z',
111
- "cgpm-resolution:#{$1}/#{$2}/#{$3}#{$4}"
112
- when %r'\A/(\w{2})/CIPM/db/(\d+)/(\d+)/(#.*)?\z'
113
- "cipm-resolution:#{$1}/#{$2}/#{$3}#{$4}"
114
- when %r'\A/(\w{2})/committees/cipm/meeting/([0-9()I]+).html(#.*)?\z'
115
- "cipm-decisions:#{$1}/#{$2}#{$3}"
116
- else
117
- URI(res.uri).merge(href).to_s # Relative -> absolute
118
- end
109
+ when %r'\A/(\w{2})/CGPM/db/(\d+)/(\d+)/(#.*)?\z',
110
+ %r'\A/jsp/(\w{2})/ViewCGPMResolution\.jsp\?CGPM=(\d+)&RES=(\d+)(#.*)?\z',
111
+ %r'\A/(\w{2})/committees/cg/cgpm/(\d+)-\d+/resolution-(\d+)(#.*)?\z',
112
+ "cgpm-resolution:#{$1}/#{$2}/#{$3}#{$4}"
113
+ when %r'\A/(\w{2})/CIPM/db/(\d+)/(\d+)/(#.*)?\z'
114
+ "cipm-resolution:#{$1}/#{$2}/#{$3}#{$4}"
115
+ when %r'\A/(\w{2})/committees/cipm/meeting/([0-9()I]+).html(#.*)?\z'
116
+ "cipm-decisions:#{$1}/#{$2}#{$3}"
117
+ else
118
+ URI(res.uri).merge(href).to_s # Relative -> absolute
119
+ end
119
120
 
120
- a.set_attribute('href', href)
121
+ a.set_attribute("href", href)
121
122
  end
122
123
  end
123
124
 
124
- def replace_centers ps
125
- centers = ps.css('center').to_a
125
+ def replace_centers(ps)
126
+ centers = ps.css("center").to_a
126
127
  while centers.length > 0
127
128
  center = centers.first
128
129
  current = center
@@ -131,7 +132,7 @@ module Bipm
131
132
  break unless current.next
132
133
  while Nokogiri::XML::Text === current.next
133
134
  current = current.next
134
- break if current.text.strip != ''
135
+ break if current.text.strip != ""
135
136
  end
136
137
  break unless current.next
137
138
  break unless current.next.name == "center"
@@ -157,47 +158,52 @@ module Bipm
157
158
  end
158
159
 
159
160
  # Remove the remaining centers
160
- ps.css('center').each do |i|
161
+ ps.css("center").each do |i|
161
162
  i.replace i.inner_html
162
163
  end
163
164
  end
164
165
 
165
- def format_message part
166
+ def format_message(part)
166
167
  AsciiMath.asciidoc_extract_math(
167
- ReverseAdoc.convert(part).strip.gsub("&nbsp;", ' ').gsub(" \n", "\n")
168
+ Coradoc::Input::HTML.convert(part).strip.gsub("&nbsp;", " ").gsub(" \n", "\n")
168
169
  )
170
+ rescue
171
+ warn "Bug in Coradoc, couldn't parse the following document:"
172
+ pp part
173
+ warn "Please report this as an issue to https://github.com/metanorma/coradoc"
174
+ raise
169
175
  end
170
176
 
171
- def ng_to_string ps
172
- ps.inner_html.encode('utf-8').gsub("\r", '').gsub(%r'</?nobr>','')
177
+ def ng_to_string(ps)
178
+ ps.inner_html.encode("utf-8").gsub("\r", "").gsub(%r'</?nobr>', "")
173
179
  end
174
180
 
175
- def parse_resolution res, res_id, date, type = :cgpm, lang = 'en', rec_type = nil
181
+ def parse_resolution(res, res_id, date, type = :cgpm, lang = "en", rec_type = nil)
176
182
  # Reparse the document after fixing upstream syntax
177
183
  fixed_body = res.body.gsub("<name=", "<a name=")
178
- fixed_body = fixed_body.force_encoding('utf-8')
179
- fixed_body = fixed_body.gsub('&Eacute;', 'É')
180
- fixed_body = fixed_body.gsub('&#171;&#032;', '« ')
181
- fixed_body = fixed_body.gsub('&#032;&#187;', ' »')
182
- fixed_body = fixed_body.sub(%r'<h1>(.*?)</h1>'m, '')
184
+ fixed_body = fixed_body.force_encoding("utf-8")
185
+ fixed_body = fixed_body.gsub("&Eacute;", "É")
186
+ fixed_body = fixed_body.gsub("&#171;&#032;", "« ")
187
+ fixed_body = fixed_body.gsub("&#032;&#187;", " »")
188
+ fixed_body = fixed_body.sub(%r'<h1>(.*?)</h1>'m, "")
183
189
  supertitle = $1.strip
184
- fixed_body = fixed_body.sub(%r'<h2>(.*?)</h2>'m, '')
190
+ fixed_body = fixed_body.sub(%r'<h2>(.*?)</h2>'m, "")
185
191
  title = $1.strip
186
192
  fixed_body = fixed_body.sub(/(="web-content">)\s*<p>\s*(<p)/, '\1\2')
187
193
  fixed_body = fixed_body.gsub(%r"<a name=\"haut\">(.*?)</a>"m, '\1')
188
194
  ng = Nokogiri::HTML(fixed_body, res.uri.to_s, "utf-8", Nokogiri::XML::ParseOptions.new.default_html.noent)
189
195
 
190
- refs = ng.css('.publication-card_reference a')
196
+ refs = ng.css(".publication-card_reference a")
191
197
 
192
198
  if rec_type.end_with? "?"
193
199
  rec_type = case supertitle
194
- when /\AD[eé]claration/
195
- "declaration"
196
- when /\AR[eé]solution/
197
- "resolution"
198
- else
199
- rec_type[..-2]
200
- end
200
+ when /\AD[eé]claration/
201
+ "statement"
202
+ when /\AR[eé]solution/
203
+ "resolution"
204
+ else
205
+ rec_type[..-2]
206
+ end
201
207
  end
202
208
 
203
209
  r = {
@@ -220,7 +226,7 @@ module Bipm
220
226
  r.delete("type") unless r["type"]
221
227
 
222
228
  if refs.length > 0
223
- r["reference"] = res.uri.merge(refs.first.attr('href')).to_s.split('?').first
229
+ r["reference"] = res.uri.merge(refs.first.attr("href")).to_s.split("?").first
224
230
  name, page = refs.first.text.strip.split(/, p(?=[0-9])/)
225
231
  r["reference_name"] = name
226
232
  if page
@@ -234,7 +240,7 @@ module Bipm
234
240
  r.delete("reference_page")
235
241
  end
236
242
 
237
- ps = ng.css('div.journal-content-article').first
243
+ ps = ng.css("div.journal-content-article").first
238
244
 
239
245
  #binding.pry if ps.count != 1
240
246
 
@@ -247,6 +253,11 @@ module Bipm
247
253
  doc = Common.ng_to_string(ps)
248
254
  # doc = AsciiMath.html_to_asciimath(doc)
249
255
 
256
+ if doc.match? DOIREGEX
257
+ doc = doc.sub(DOIREGEX, "")
258
+ r["doi"] = $1
259
+ end
260
+
250
261
  parts = doc.split(/(\n(?:<p>)?<b>.*?<\/b>|\n<p><i>.*?<\/i>|<div class="bipm-lame-grey">|<h3>|<p>(?:après examen |après avoir entendu )|having noted that |decides to define |décide de définir |conformément à l'invitation|acting in accordance with|recommande que les résultats|(?:strongly |and further |)(?:considers|recommends|recommande) (?:la|that|que(?! « ))|estime que|declares<\/p>|déclare :<\/b><\/p>|<a name="_ftn\d)/)
251
262
  nparts = [parts.shift]
252
263
  while parts.length > 0
@@ -267,13 +278,13 @@ module Bipm
267
278
  next
268
279
  end
269
280
 
270
- if parse.start_with? 'NOTE'
271
- part = part.sub('<h3>NOTE</h3>', '')
281
+ if parse.start_with? "NOTE"
282
+ part = part.sub("<h3>NOTE</h3>", "")
272
283
  r["notes"] = Common.format_message(part)
273
284
  next
274
285
  end
275
286
 
276
- CONSIDERATIONS.any? do |k,v|
287
+ CONSIDERATIONS.any? do |k, v|
277
288
  if parse =~ /\A#{PREFIX}#{k}\b/i
278
289
  r["considerations"] << prev = {
279
290
  "type" => v,
@@ -283,7 +294,7 @@ module Bipm
283
294
  end
284
295
  end && next
285
296
 
286
- ACTIONS.any? do |k,v|
297
+ ACTIONS.any? do |k, v|
287
298
  if parse =~ /\A#{PREFIX}#{k}\b/i
288
299
  r["actions"] << prev = {
289
300
  "type" => v,
@@ -314,13 +325,13 @@ module Bipm
314
325
  end
315
326
 
316
327
  %w[considerations actions].each do |type|
317
- map = type == 'actions' ? ACTIONS : CONSIDERATIONS
328
+ map = type == "actions" ? ACTIONS : CONSIDERATIONS
318
329
  r[type] = r[type].map do |i|
319
330
  islist = false
320
331
 
321
332
  kk = nil
322
333
 
323
- if map.any? { |k,v| (i["message"].split("\n").first =~ /\A\s*([*_]?)(#{PREFIX}#{k})\1?(#{SUFFIX})\1?\s*\z/i) && (kk = k) }
334
+ if map.any? { |k, v| (i["message"].split("\n").first =~ /\A\s*([*_]?)(#{PREFIX}#{k})\1?(#{SUFFIX})\1?\s*\z/i) && (kk = k) }
324
335
  prefix = $2
325
336
  suffix = $3
326
337
  subject = $4
@@ -347,15 +358,15 @@ module Bipm
347
358
 
348
359
  if subject
349
360
  #p subject
350
- r['subject'] ||= []
351
- r['subject'] << subject
361
+ r["subject"] ||= []
362
+ r["subject"] << subject
352
363
  end
353
364
 
354
365
  if islist
355
366
  suffix = suffix.strip
356
- suffix = nil if suffix == ''
367
+ suffix = nil if suffix == ""
357
368
  listitems.map do |li|
358
- i.merge 'message' => [prefix, suffix, li].compact.join(" ")
369
+ i.merge "message" => [prefix, suffix, li].compact.join(" ")
359
370
  end
360
371
  else
361
372
  i
@@ -363,13 +374,13 @@ module Bipm
363
374
  end.flatten
364
375
  end
365
376
 
366
- if r['subject']
367
- r['subject'] = r['subject'].uniq.join(" and ")
377
+ if r["subject"]
378
+ r["subject"] = r["subject"].uniq.join(" and ")
368
379
  end
369
380
 
370
381
  # Note: we replace the previously set r['subject'].
371
- r['subject'] = type.to_s.upcase.gsub("-", ' ')
372
- r['subject'] = 'CCDS' if type == :cctf && supertitle.include?("CCDS")
382
+ r["subject"] = type.to_s.upcase.gsub("-", " ")
383
+ r["subject"] = "CCDS" if type == :cctf && supertitle.include?("CCDS")
373
384
 
374
385
  r
375
386
  end
@@ -377,11 +388,11 @@ module Bipm
377
388
  def extract_pdf(meeting, lang)
378
389
  pdfs = meeting.css('a.title-third[href*=".pdf"]')
379
390
  .map { |i| i.attr("href") }
380
- .map { |i| i.split('?').first }
391
+ .map { |i| i.split("?").first }
381
392
  .select do |i|
382
- i.downcase.include?("-#{lang}.pdf") ||
383
- %w[en fr].none? { |l| i.downcase.include? "-#{l}.pdf" }
384
- end
393
+ i.downcase.include?("-#{lang}.pdf") ||
394
+ %w[en fr].none? { |l| i.downcase.include? "-#{l}.pdf" }
395
+ end
385
396
 
386
397
  pdfs = pdfs.first if pdfs.length <= 1
387
398
 
@@ -389,8 +400,10 @@ module Bipm
389
400
  end
390
401
 
391
402
  def extract_date(date_str)
403
+ return nil unless date_str
404
+
392
405
  date = date_str.strip
393
- .gsub(/\s+/, ' ')
406
+ .gsub(/\s+/, " ")
394
407
  .gsub("février", "february") # 3 first letters must match English
395
408
  .gsub("juin", "june")
396
409
  .gsub("avril", "april")
@@ -411,7 +424,6 @@ module Bipm
411
424
 
412
425
  extend self
413
426
  end
414
-
415
427
  end
416
428
  end
417
429
  end
@@ -3,7 +3,7 @@
3
3
  module Bipm
4
4
  module Data
5
5
  module Importer
6
- VERSION = "0.1.3"
6
+ VERSION = "0.2.1"
7
7
  end
8
8
  end
9
9
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bipm-data-importer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-01-06 00:00:00.000000000 Z
11
+ date: 2024-12-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -39,7 +39,7 @@ dependencies:
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
41
  - !ruby/object:Gem::Dependency
42
- name: reverse_adoc
42
+ name: coradoc
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
45
  - - ">="
@@ -206,7 +206,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
206
206
  - !ruby/object:Gem::Version
207
207
  version: '0'
208
208
  requirements: []
209
- rubygems_version: 3.3.26
209
+ rubygems_version: 3.3.27
210
210
  signing_key:
211
211
  specification_version: 4
212
212
  summary: Importer for BIPM CGPM and CIPM content