ietf-data-importer 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,347 +1,198 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative "base_scraper"
4
- require_relative "../group_collection"
5
4
 
6
5
  module Ietf
7
6
  module Data
8
7
  module Importer
9
8
  module Scrapers
10
- # Scraper for IRTF groups from irtf.org
11
9
  class IrtfScraper < BaseScraper
12
- # Base URL for IRTF website
13
10
  BASE_URL = "https://www.irtf.org/groups.html"
14
11
 
15
- # Fetch all IRTF groups
16
- # @return [Array<Ietf::Data::Importer::Group>] Array of Group objects
12
+ SECTION_TITLES = [
13
+ "Active Research Groups",
14
+ "Current Research Groups",
15
+ "Research Groups",
16
+ "IRTF Groups",
17
+ ].freeze
18
+
17
19
  def fetch
18
- groups = []
19
20
  log "Fetching IRTF groups..."
20
21
 
21
- begin
22
- doc = fetch_html(BASE_URL)
23
- return [] unless doc
24
-
25
- # First try to extract from the dropdown menu
26
- dropdown_groups = extract_from_dropdown(doc)
27
- if dropdown_groups.any?
28
- log "Found #{dropdown_groups.size} groups in dropdown menu", 1
29
- groups.concat(dropdown_groups)
30
- return groups
31
- end
32
-
33
- # If dropdown extraction fails, fall back to traditional section-based extraction
34
- # Debug the page structure
35
- headings = doc.css('h3').map(&:text).join(', ')
36
- log "Found headings on IRTF page: #{headings}", 1
37
-
38
- # Extract active groups
39
- active_groups = extract_groups(doc, 'Active Research Groups', 'active')
40
- log "Found #{active_groups.size} active IRTF groups", 1
41
-
42
- # Extract concluded groups
43
- concluded_groups = extract_groups(doc, 'Concluded Research Groups', 'concluded')
44
- log "Found #{concluded_groups.size} concluded IRTF groups", 1
45
-
46
- groups.concat(active_groups)
47
- groups.concat(concluded_groups)
48
-
49
- # If still no groups found, try alternative selectors
50
- if groups.empty?
51
- log "No groups found with standard selectors, trying alternatives...", 1
52
-
53
- # Try different section titles
54
- ['Current Research Groups', 'Research Groups', 'IRTF Groups'].each do |title|
55
- section_groups = extract_groups(doc, title, 'active')
56
- if section_groups.any?
57
- log "Found #{section_groups.size} groups with section title: #{title}", 1
58
- groups.concat(section_groups)
59
- end
60
- end
61
-
62
- # Try a more generic approach if still no groups
63
- if groups.empty?
64
- log "Using generic list item selector...", 1
65
- # Find any unordered list with links
66
- doc.css('ul').each do |list|
67
- if list.css('li a').any?
68
- generic_groups = extract_groups_from_list(list, 'active')
69
- if generic_groups.any?
70
- log "Found #{generic_groups.size} groups using generic list selector", 1
71
- groups.concat(generic_groups)
72
- end
73
- end
74
- end
75
- end
76
- end
77
- rescue => e
78
- log "Error fetching IRTF groups: #{e.message}", 1
79
- end
22
+ doc = fetch_html(BASE_URL)
23
+ return build_collection([]) unless doc
24
+
25
+ groups = extract_from_dropdown(doc)
26
+ return build_collection(groups) if groups.any?
80
27
 
81
- groups
28
+ log "Dropdown extraction empty, falling back to section parsing", 1
29
+ build_collection(extract_from_sections(doc))
30
+ rescue StandardError => e
31
+ log "Error fetching IRTF groups: #{e.message}", 1
32
+ build_collection([])
82
33
  end
83
34
 
84
- # Extract groups from the dropdown menu
85
- # @param doc [Nokogiri::HTML::Document] The HTML document
86
- # @return [Array<Ietf::Data::Importer::Group>] Array of Group objects
87
- def extract_from_dropdown(doc)
88
- groups = []
35
+ private
89
36
 
90
- # Look for the dropdown menu containing research groups
91
- dropdown = doc.css('a.dropdown-toggle').find do |el|
92
- el.text.include?('Research Groups')
37
+ def extract_from_dropdown(doc)
38
+ dropdown = doc.css("a.dropdown-toggle").find do |el|
39
+ el.text.include?("Research Groups")
93
40
  end
94
-
95
41
  return [] unless dropdown
96
42
 
97
- # Find the dropdown menu
98
- dropdown_parent = dropdown.parent
99
- dropdown_menu = dropdown_parent.css('.dropdown-menu')
100
- return [] unless dropdown_menu.any?
43
+ menu = dropdown.parent.css(".dropdown-menu")
44
+ return [] unless menu.any?
101
45
 
102
46
  log "Found dropdown menu with research groups", 1
103
47
 
104
- # Extract groups from the dropdown menu
105
- dropdown_menu.css('a.dropdown-item').each do |link|
106
- next unless link && link['href']
48
+ menu.css("a.dropdown-item").filter_map do |link|
49
+ next unless link && link["href"]
107
50
 
108
- name = link.text.strip
109
- href = link['href']
110
-
111
- # Extract abbreviation from href (e.g., cfrg.html -> CFRG)
112
- if href =~ /(\w+)\.html$/
113
- abbreviation = $1.upcase
114
- else
115
- next # Skip if we can't determine abbreviation
116
- end
117
-
118
- # Construct full URL if it's a relative path
119
- details_url = href
120
- if !details_url.start_with?('http')
121
- if details_url.start_with?('/')
122
- details_url = "https://www.irtf.org#{details_url}"
123
- else
124
- details_url = "https://www.irtf.org/#{details_url}"
125
- end
126
- end
127
-
128
- begin
129
- details = fetch_group_details(details_url)
130
-
131
- group = Importer::Group.new(
132
- abbreviation: abbreviation,
133
- name: name,
134
- organization: 'irtf',
135
- type: 'rg',
136
- area: nil,
137
- status: 'active', # Assume active since it's in the menu
138
- description: nil, # Will be populated from details page if available
139
- chairs: details[:chairs],
140
- mailing_list: details[:mailing_list],
141
- mailing_list_archive: details[:mailing_list_archive],
142
- website_url: details_url,
143
- charter_url: details[:charter_url],
144
- concluded_date: details[:concluded_date]
145
- )
146
-
147
- groups << group
148
- rescue => e
149
- log "Error fetching details for #{abbreviation} (#{details_url}): #{e.message}", 2
150
- end
151
- end
51
+ abbreviation = extract_abbreviation_from_href(link["href"])
52
+ next unless abbreviation
152
53
 
153
- groups
54
+ details_url = resolve_url(link["href"])
55
+ details = fetch_group_details(details_url)
56
+
57
+ build_group(
58
+ abbreviation: abbreviation,
59
+ name: link.text.strip,
60
+ organization: "irtf",
61
+ type: "rg",
62
+ status: "active",
63
+ website_url: details_url,
64
+ **details,
65
+ )
66
+ rescue StandardError => e
67
+ log "Error fetching details for #{abbreviation} (#{details_url}): #{e.message}",
68
+ 2
69
+ nil
70
+ end
154
71
  end
155
72
 
156
- private
73
+ def extract_from_sections(doc)
74
+ log "Found headings: #{doc.css('h3').map(&:text).join(', ')}", 1
157
75
 
158
- # Extract groups from a section on the IRTF page
159
- # @param doc [Nokogiri::HTML::Document] The HTML document
160
- # @param section_title [String] The title of the section to extract from
161
- # @param status [String] The status of the groups in this section (active/concluded)
162
- # @return [Array<Ietf::Data::Importer::Group>] Array of Group objects
163
- def extract_groups(doc, section_title, status)
164
- groups = []
165
- section = doc.xpath("//h3[contains(text(), '#{section_title}')]/following-sibling::ul[1]")
76
+ active = extract_from_section(doc, "Active Research Groups",
77
+ "active")
78
+ log "Found #{active.size} active IRTF groups", 1
166
79
 
167
- section.css('li').each do |group_item|
168
- link = group_item.at_css('a')
169
- next unless link
80
+ concluded = extract_from_section(doc, "Concluded Research Groups",
81
+ "concluded")
82
+ log "Found #{concluded.size} concluded IRTF groups", 1
170
83
 
171
- name = link.text.strip
172
- abbreviation = nil
84
+ groups = active + concluded
85
+ return groups if groups.any?
173
86
 
174
- # Extract abbreviation from the text (typically in parentheses)
175
- if name =~ /\(([^)]+)\)/
176
- abbreviation = $1
177
- end
178
-
179
- # If unable to extract abbreviation, try from the URL
180
- if abbreviation.nil? && link['href'] =~ %r{/(\w+)/?$}
181
- abbreviation = $1.upcase
182
- end
87
+ log "No groups found with standard selectors, trying alternatives...",
88
+ 1
89
+ extract_from_fallback_sections(doc)
90
+ end
183
91
 
184
- next unless abbreviation
92
+ def extract_from_section(doc, title, status)
93
+ section = doc.xpath("//h3[contains(text(), '#{title}')]/following-sibling::ul[1]")
94
+ extract_groups_from_list(section, status)
95
+ end
185
96
 
186
- # Extract description (text after the link)
187
- description = group_item.text.sub(link.text, '').strip
188
-
189
- # Remove parenthesized abbreviation from description
190
- description = description.sub(/\s*\([^)]+\)\s*/, ' ').strip
191
-
192
- # Get details from the group's page
193
- details_url = link['href']
194
- begin
195
- details = fetch_group_details(details_url)
196
-
197
- group = Importer::Group.new(
198
- abbreviation: abbreviation,
199
- name: name.sub(/\s*\([^)]+\)\s*/, '').strip,
200
- organization: 'irtf',
201
- type: 'rg',
202
- area: nil,
203
- status: status,
204
- description: description,
205
- chairs: details[:chairs],
206
- mailing_list: details[:mailing_list],
207
- mailing_list_archive: details[:mailing_list_archive],
208
- website_url: details_url,
209
- charter_url: details[:charter_url],
210
- concluded_date: details[:concluded_date]
211
- )
212
-
213
- groups << group
214
- rescue => e
215
- log "Error fetching details for #{abbreviation}: #{e.message}", 2
216
- end
97
+ def extract_from_fallback_sections(doc)
98
+ SECTION_TITLES.each do |title|
99
+ groups = extract_from_section(doc, title, "active")
100
+ return groups if groups.any?
217
101
  end
218
102
 
219
- groups
103
+ doc.css("ul").flat_map do |list|
104
+ next [] unless list.css("li a").any?
105
+
106
+ extract_groups_from_list(list, "active")
107
+ end
220
108
  end
221
109
 
222
- # Helper method to extract groups from any list without requiring a specific section heading
223
- # @param list_element [Nokogiri::XML::Element] The list element to extract from
224
- # @param status [String] The status of the groups in this list (active/concluded)
225
- # @return [Array<Ietf::Data::Importer::Group>] Array of Group objects
226
110
  def extract_groups_from_list(list_element, status)
227
- groups = []
228
-
229
- list_element.css('li').each do |group_item|
230
- link = group_item.at_css('a')
231
- next unless link && link['href']
111
+ list_element.css("li").filter_map do |item|
112
+ link = item.at_css("a")
113
+ next unless link && link["href"]
232
114
 
233
115
  name = link.text.strip
234
- abbreviation = nil
235
-
236
- # Extract abbreviation from the text (typically in parentheses)
237
- if name =~ /\(([^)]+)\)/
238
- abbreviation = $1
239
- end
240
-
241
- # If unable to extract abbreviation, try from the URL
242
- if abbreviation.nil? && link['href'] =~ %r{/(\w+)/?$}
243
- abbreviation = $1.upcase
244
- end
245
-
246
- next unless abbreviation && !abbreviation.empty?
247
-
248
- # Extract description (text after the link)
249
- description = group_item.text.sub(link.text, '').strip
250
-
251
- # Remove parenthesized abbreviation from description
252
- description = description.sub(/\s*\([^)]+\)\s*/, ' ').strip
253
-
254
- # Get details from the group's page
255
- details_url = link['href']
256
- # Ensure we have a full URL
257
- if !details_url.start_with?('http')
258
- if details_url.start_with?('/')
259
- details_url = "https://www.irtf.org#{details_url}"
260
- else
261
- details_url = "https://www.irtf.org/#{details_url}"
262
- end
263
- end
264
-
265
- begin
266
- details = fetch_group_details(details_url)
267
-
268
- group = Importer::Group.new(
269
- abbreviation: abbreviation,
270
- name: name.sub(/\s*\([^)]+\)\s*/, '').strip,
271
- organization: 'irtf',
272
- type: 'rg',
273
- area: nil,
274
- status: status,
275
- description: description,
276
- chairs: details[:chairs],
277
- mailing_list: details[:mailing_list],
278
- mailing_list_archive: details[:mailing_list_archive],
279
- website_url: details_url,
280
- charter_url: details[:charter_url],
281
- concluded_date: details[:concluded_date]
282
- )
283
-
284
- groups << group
285
- rescue => e
286
- log "Error fetching details for #{abbreviation} (#{details_url}): #{e.message}", 2
287
- end
116
+ abbreviation = extract_abbreviation(name, link["href"])
117
+ next unless abbreviation
118
+
119
+ description = extract_description(item, link)
120
+ details_url = resolve_url(link["href"])
121
+ details = fetch_group_details(details_url)
122
+
123
+ build_group(
124
+ abbreviation: abbreviation,
125
+ name: name.sub(/\s*\([^)]+\)\s*/, "").strip,
126
+ organization: "irtf",
127
+ type: "rg",
128
+ status: status,
129
+ description: description,
130
+ website_url: details_url,
131
+ **details,
132
+ )
133
+ rescue StandardError => e
134
+ log "Error fetching details for #{abbreviation} (#{details_url}): #{e.message}",
135
+ 2
136
+ nil
288
137
  end
138
+ end
289
139
 
290
- groups
140
+ def extract_abbreviation(name, href)
141
+ if name =~ /\(([^)]+)\)/
142
+ $1
143
+ elsif href =~ %r{/(\w+)/?$}
144
+ $1.upcase
145
+ end
291
146
  end
292
147
 
293
- # Fetch details for a specific IRTF group from its page
294
- # @param url [String] The URL of the group's page
295
- # @return [Hash] Hash of group details
296
- def fetch_group_details(url)
297
- details = {
298
- chairs: [],
299
- mailing_list: nil,
300
- mailing_list_archive: nil,
301
- charter_url: nil,
302
- concluded_date: nil
303
- }
148
+ def extract_abbreviation_from_href(href)
149
+ $1.upcase if href =~ /(\w+)\.html$/
150
+ end
304
151
 
305
- doc = fetch_html(url)
306
- return details unless doc
152
+ def extract_description(item, link)
153
+ item.text.sub(link.text, "").sub(/\s*\([^)]+\)\s*/, " ").strip
154
+ end
307
155
 
308
- # Extract chairs
309
- chair_section = doc.xpath("//h3[contains(text(), 'Chair')]/following-sibling::p[1]")
310
- if chair_section
311
- details[:chairs] << chair_section.text.strip
156
+ def resolve_url(href)
157
+ case href
158
+ when %r{\Ahttps?://} then href
159
+ when %r{\A/} then "https://www.irtf.org#{href}"
160
+ else "https://www.irtf.org/#{href}"
312
161
  end
162
+ end
313
163
 
314
- # Extract mailing list
315
- mailing_list = doc.at_css('a[href^="mailto:"]')
316
- if mailing_list
317
- details[:mailing_list] = mailing_list['href'].sub('mailto:', '')
318
- end
164
+ def fetch_group_details(url)
165
+ doc = fetch_html(url)
166
+ return {} unless doc
167
+
168
+ {
169
+ chairs: extract_chairs(doc),
170
+ mailing_list: doc.at_css('a[href^="mailto:"]')&.[]("href")&.sub(
171
+ "mailto:", ""
172
+ ),
173
+ mailing_list_archive: doc.at_css('a[href*="mailarchive.ietf.org"]')&.[]("href"),
174
+ charter_url: extract_charter_url(doc, url),
175
+ concluded_date: extract_concluded_date(doc, url),
176
+ }
177
+ end
319
178
 
320
- # Extract mailing list archive
321
- archive = doc.at_css('a[href*="mailarchive.ietf.org"]')
322
- if archive
323
- details[:mailing_list_archive] = archive['href']
324
- end
179
+ def extract_chairs(doc)
180
+ chair = doc.xpath("//h3[contains(text(), 'Chair')]/following-sibling::p[1]")
181
+ chair ? [chair.text.strip] : []
182
+ end
325
183
 
326
- # Extract charter URL
327
- charter_link = doc.at_css('a[href*="charter"]')
328
- if charter_link
329
- details[:charter_url] = URI.join(url, charter_link['href']).to_s
330
- end
184
+ def extract_charter_url(doc, base_url)
185
+ link = doc.at_css('a[href*="charter"]')
186
+ URI.join(base_url, link["href"]).to_s if link
187
+ end
331
188
 
332
- # Extract concluded date from the page or the URL
333
- if url.include?('/concluded/')
334
- concluded_info = doc.text.match(/concluded in\s+([A-Z][a-z]+\s+\d{4})/)
335
- if concluded_info
336
- begin
337
- details[:concluded_date] = Date.parse(concluded_info[1])
338
- rescue
339
- # Just leave it as nil if we can't parse it
340
- end
341
- end
342
- end
189
+ def extract_concluded_date(doc, url)
190
+ return nil unless url.include?("/concluded/")
343
191
 
344
- details
192
+ match = doc.text.match(/concluded in\s+([A-Z][a-z]+\s+\d{4})/)
193
+ Date.parse(match[1]) if match
194
+ rescue Date::Error
195
+ nil
345
196
  end
346
197
  end
347
198
  end
@@ -3,61 +3,33 @@
3
3
  require_relative "scrapers/base_scraper"
4
4
  require_relative "scrapers/ietf_scraper"
5
5
  require_relative "scrapers/irtf_scraper"
6
- require_relative "group_collection"
7
6
 
8
7
  module Ietf
9
8
  module Data
10
9
  module Importer
11
- # Module for IETF/IRTF web scrapers
12
10
  module Scrapers
13
- # Fetch all IETF and IRTF groups
14
- # @return [Ietf::Data::Importer::GroupCollection] Collection of all groups
15
11
  def self.fetch_all
16
12
  puts "Starting to fetch IETF and IRTF group data..."
17
13
 
18
- # Fetch IETF groups
19
- ietf_groups = IetfScraper.new.fetch
20
- puts "Fetched #{ietf_groups.size} IETF groups"
14
+ ietf = fetch_ietf
15
+ puts "Fetched #{ietf.size} IETF groups"
21
16
 
22
- # Fetch IRTF groups
23
- irtf_groups = IrtfScraper.new.fetch
24
- puts "Fetched #{irtf_groups.size} IRTF groups"
17
+ irtf = fetch_irtf
18
+ puts "Fetched #{irtf.size} IRTF groups"
25
19
 
26
- # Combine all groups and return as a collection
27
- all_groups = ietf_groups + irtf_groups
28
- puts "Total: #{all_groups.size} groups"
20
+ merged = ietf.merge(irtf)
21
+ puts "Total: #{merged.size} groups"
29
22
 
30
- Importer::GroupCollection.new(groups: all_groups)
23
+ merged
31
24
  end
32
25
 
33
- # Fetch IETF groups only
34
- # @return [Array<Ietf::Data::Importer::Group>] Array of IETF groups
35
26
  def self.fetch_ietf
36
27
  IetfScraper.new.fetch
37
28
  end
38
29
 
39
- # Fetch IRTF groups only
40
- # @return [Array<Ietf::Data::Importer::Group>] Array of IRTF groups
41
30
  def self.fetch_irtf
42
31
  IrtfScraper.new.fetch
43
32
  end
44
-
45
- # Save group collection to a file
46
- # @param collection [Ietf::Data::Importer::GroupCollection] Group collection to save
47
- # @param file_path [String] Path to the output file
48
- # @param format [Symbol] Output format (:yaml or :json)
49
- def self.save_to_file(collection, file_path, format = :yaml)
50
- case format.to_sym
51
- when :yaml
52
- File.write(file_path, collection.to_yaml)
53
- when :json
54
- File.write(file_path, collection.to_json)
55
- else
56
- raise ArgumentError, "Unsupported format: #{format}"
57
- end
58
-
59
- puts "Saved #{collection.groups.size} groups to #{file_path}"
60
- end
61
33
  end
62
34
  end
63
35
  end
@@ -3,7 +3,7 @@
3
3
  module Ietf
4
4
  module Data
5
5
  module Importer
6
- VERSION = "0.3.0"
6
+ VERSION = "0.3.1"
7
7
  end
8
8
  end
9
9
  end