ietf-data-importer 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,33 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "nokogiri"
4
+ require "open-uri"
5
+
6
+ module Ietf
7
+ module Data
8
+ module Importer
9
+ module Scrapers
10
+ # Base class for web scrapers
11
+ class BaseScraper
12
# Fetch a URL and parse the response as HTML with Nokogiri.
#
# The block form of URI.open is used so the IO object it yields is closed
# as soon as parsing finishes, rather than staying open until it is
# garbage collected.
#
# @param url [String, URI] The URL to fetch
# @return [Nokogiri::HTML::Document, nil] The parsed document, or nil when
#   fetching or parsing raised a StandardError (the error is printed)
def fetch_html(url)
  URI.open(url) { |io| Nokogiri::HTML(io) }
rescue => e
  puts " Error fetching URL #{url}: #{e.message}"
  nil
end
21
+
22
# Print a message to standard output behind a level-dependent prefix.
# @param message [String] The text to print
# @param level [Integer] How many indent units to prepend (default: 0)
def log(message, level = 0)
  puts "#{" " * level}#{message}"
end
29
+ end
30
+ end
31
+ end
32
+ end
33
+ end
@@ -0,0 +1,272 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "base_scraper"
4
+ require_relative "../group_collection"
5
+
6
+ module Ietf
7
+ module Data
8
+ module Importer
9
+ module Scrapers
10
+ # Scraper for IETF groups from datatracker.ietf.org
11
+ class IetfScraper < BaseScraper
12
+ # Base URL for IETF datatracker
13
+ BASE_URL = "https://datatracker.ietf.org/group/"
14
+
15
# Collect the groups of every IETF group type.
#
# For each type returned by fetch_group_types, the type's listing page is
# fetched and handed to extract_groups_from_table, which appends to the
# result array. Types with a missing/empty URL, or whose page could not be
# fetched, contribute nothing.
#
# @return [Array<Ietf::Data::Importer::Group>] Array of Group objects
def fetch
  log "Fetching IETF groups..."

  fetch_group_types.each_with_object([]) do |type, collected|
    log "Fetching #{type[:name]} groups...", 1

    path = type[:url]
    next if path.nil? || path.empty?

    # Insert a slash only when the path does not already start with one
    separator = path.start_with?('/') ? '' : '/'
    page = fetch_html("https://datatracker.ietf.org#{separator}#{path}")
    next unless page

    extract_groups_from_table(page, type, collected)
  end
end
46
+
47
+ private
48
+
49
# Discover the group types listed on the main IETF groups page.
#
# Each row of the `table.tablesorter` table contributes one type, built from
# the first link found inside a cell. Rows are skipped when the link has no
# href, when the href contains no "/", or when the href has no final path
# segment (for example "/"); the last case would otherwise raise
# NoMethodError on `nil.downcase`. When the table yields no types, a
# predefined list is used instead.
#
# @return [Array<Hash>] Hashes with :name, :abbreviation and :url keys;
#   empty when the main page could not be fetched
def fetch_group_types
  doc = fetch_html(BASE_URL)
  return [] unless doc

  log "Looking for group types on the page...", 1

  # Try to find the types from the table first
  group_types = []
  doc.css('table.tablesorter tbody tr').each do |row|
    type_cell = row.at_css('td a')
    next unless type_cell && type_cell['href']

    href = type_cell['href']
    next unless href.include?('/')

    # String#split drops trailing empty fields, so an href consisting only
    # of slashes leaves no segment to use as the abbreviation.
    type_abbr = href.sub(/\/$/, '').split('/').last
    next unless type_abbr

    group_types << {
      name: type_cell.text.strip,
      abbreviation: type_abbr.downcase,
      url: href
    }
  end

  # If we didn't find any types in the table, use the predefined list
  if group_types.empty?
    log "Using predefined group types...", 1
    group_types = [
      { name: "Working Group", abbreviation: "wg", url: "/wg/" },
      { name: "Research Group", abbreviation: "rg", url: "/rg/" },
      { name: "Area", abbreviation: "area", url: "/area/" },
      { name: "Team", abbreviation: "team", url: "/team/" },
      { name: "Program", abbreviation: "program", url: "/program/" },
      { name: "Directorate", abbreviation: "dir", url: "/dir/" },
      { name: "Advisory Group", abbreviation: "ag", url: "/ag/" },
      { name: "BOF", abbreviation: "bof", url: "/bof/" }
    ]
  end

  log "Found #{group_types.size} group types: #{group_types.map { |t| t[:abbreviation] }.join(', ')}", 1
  group_types
end
97
+
98
# Build Group objects from the rows of a group-type listing page and append
# them to +groups+.
#
# The first selector that matches any rows decides which rows are used.
# For every row the abbreviation and name are looked up by CSS class, then
# by column position, then from the row's first link. Rows lacking either
# value, or lacking a link, are skipped.
#
# The detail URL is resolved inside the per-row rescue so that a row with a
# missing or malformed href is logged and skipped instead of raising out of
# the whole scrape.
#
# @param doc [Nokogiri::HTML::Document] The HTML document
# @param type [Hash] The group type information (reads :abbreviation)
# @param groups [Array<Ietf::Data::Importer::Group>] Array to add groups to
def extract_groups_from_table(doc, type, groups)
  # Try different table selectors
  selectors = [
    '.group-list tbody tr',       # Traditional format
    'table.table-sm tbody tr',    # New table format
    'table.tablesorter tbody tr'  # Another possible format
  ]

  rows = []
  selectors.each do |selector|
    found_rows = doc.css(selector)
    next unless found_rows.any?

    log "Found #{found_rows.size} groups using selector: #{selector}", 2
    rows = found_rows
    break
  end

  rows.each do |row|
    # First, try to find the abbreviation and name using standard classes
    abbreviation = row.at_css('.acronym')&.text&.strip
    name = row.at_css('.name')&.text&.strip

    # If that doesn't work, try to find by column position
    if abbreviation.nil? || name.nil?
      cells = row.css('td')
      if cells.size >= 2
        abbreviation ||= cells[0].text.strip
        name ||= cells[1].text.strip
      end
    end

    # If we still don't have them, try to extract from links
    if abbreviation.nil? || name.nil?
      link = row.at_css('a')
      if link
        # Last path segment of the href, upper-cased
        abbreviation ||= link['href'].to_s[%r{/([^/]+)/?$}, 1]&.upcase
        name ||= link.text.strip
      end
    end

    # Skip if we still couldn't extract basic info
    next unless abbreviation && name && !abbreviation.empty? && !name.empty?

    # Default to active; the checks run in order, so an "active" marker
    # overrides an earlier "concluded" match.
    status = 'active'
    status = 'concluded' if row['class'] && row['class'].include?('concluded')
    status = 'concluded' if row.text.include?('Concluded')
    status = 'active' if row.at_css('.active') || row.text.include?('Active')

    area_element = row.at_css('.area')
    area = area_element ? area_element.text.strip : nil

    # Get the group detail page link
    detail_link = row.at_css('a')
    next unless detail_link

    begin
      # URI.join raises on a nil or malformed href; keep it inside the rescue
      detail_url = URI.join(BASE_URL, detail_link['href'])
      details = fetch_group_details(detail_url)

      groups << Importer::Group.new(
        abbreviation: abbreviation,
        name: name,
        organization: 'ietf',
        type: type[:abbreviation],
        area: area,
        status: status,
        description: details[:description],
        chairs: details[:chairs],
        mailing_list: details[:mailing_list],
        mailing_list_archive: details[:mailing_list_archive],
        website_url: details[:website_url],
        charter_url: details[:charter_url],
        concluded_date: details[:concluded_date]
      )
    rescue => e
      log "Error fetching details for #{abbreviation}: #{e.message}", 2
    end
  end
end
203
+
204
# Scrape the detail fields of one group from its datatracker page.
#
# Every field starts out empty and is filled only when the matching element
# is present on the page; if the page cannot be fetched the empty defaults
# are returned unchanged.
#
# @param url [String] The URL of the group's page
# @return [Hash] Hash with :description, :chairs, :mailing_list,
#   :mailing_list_archive, :website_url, :charter_url and :concluded_date
def fetch_group_details(url)
  details = {
    description: nil,
    chairs: [],
    mailing_list: nil,
    mailing_list_archive: nil,
    website_url: nil,
    charter_url: nil,
    concluded_date: nil
  }

  page = fetch_html(url)
  return details unless page

  # Description comes from the charter section
  charter_node = page.at_css('#charter')
  details[:description] = charter_node.text.strip if charter_node

  # Chairs
  page.css('.role-WG-chair, .role-RG-chair').each do |chair_node|
    details[:chairs].push(chair_node.text.strip)
  end

  # Mailing list address (first mailto: link on the page)
  mail_link = page.at_css('a[href^="mailto:"]')
  details[:mailing_list] = mail_link['href'].sub('mailto:', '') if mail_link

  # Mailing list archive
  archive_link = page.at_css('a[href*="mailarchive.ietf.org"]')
  details[:mailing_list_archive] = archive_link['href'] if archive_link

  # Website, if available
  site_link = page.at_css('.additional-urls a')
  details[:website_url] = site_link['href'] if site_link

  # Charter URL, made absolute against the datatracker host
  charter_anchor = page.at_css('a[href*="/charter/"]')
  if charter_anchor
    details[:charter_url] = URI.join("https://datatracker.ietf.org", charter_anchor['href']).to_s
  end

  # Concluded date, e.g. "Concluded March 2020"
  concluded_match = /Concluded\s+([A-Z][a-z]+\s+\d{4})/.match(page.text)
  if concluded_match
    begin
      details[:concluded_date] = Date.parse(concluded_match[1])
    rescue
      # Leave the date as nil when it cannot be parsed
    end
  end

  details
end
268
+ end
269
+ end
270
+ end
271
+ end
272
+ end
@@ -0,0 +1,350 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "base_scraper"
4
+ require_relative "../group_collection"
5
+
6
+ module Ietf
7
+ module Data
8
+ module Importer
9
+ module Scrapers
10
+ # Scraper for IRTF groups from irtf.org
11
+ class IrtfScraper < BaseScraper
12
+ # Base URL for IRTF website
13
+ BASE_URL = "https://www.irtf.org/groups.html"
14
+
15
# Collect IRTF research groups from the groups page.
#
# Strategies are tried in order, and the first one that yields groups wins:
# 1. the "Research Groups" dropdown menu;
# 2. the "Active Research Groups" / "Concluded Research Groups" sections;
# 3. sections under alternative headings;
# 4. any unordered list that contains links.
# Any StandardError raised along the way is logged, and whatever was
# collected up to that point is returned.
#
# @return [Array<Ietf::Data::Importer::Group>] Array of Group objects
def fetch
  collected = []
  log "Fetching IRTF groups..."

  begin
    page = fetch_html(BASE_URL)
    return [] unless page

    # Strategy 1: dropdown menu
    from_menu = extract_from_dropdown(page)
    if from_menu.any?
      log "Found #{from_menu.size} groups in dropdown menu", 1
      return collected.concat(from_menu)
    end

    # Debug aid: show which headings the page actually has
    log "Found headings on IRTF page: #{page.css('h3').map(&:text).join(', ')}", 1

    # Strategy 2: the standard active/concluded sections
    active = extract_groups(page, 'Active Research Groups', 'active')
    log "Found #{active.size} active IRTF groups", 1

    concluded = extract_groups(page, 'Concluded Research Groups', 'concluded')
    log "Found #{concluded.size} concluded IRTF groups", 1

    collected.concat(active).concat(concluded)
    return collected unless collected.empty?

    # Strategy 3: alternative section titles
    log "No groups found with standard selectors, trying alternatives...", 1
    ['Current Research Groups', 'Research Groups', 'IRTF Groups'].each do |title|
      matches = extract_groups(page, title, 'active')
      next unless matches.any?

      log "Found #{matches.size} groups with section title: #{title}", 1
      collected.concat(matches)
    end
    return collected unless collected.empty?

    # Strategy 4: any unordered list with links
    log "Using generic list item selector...", 1
    page.css('ul').each do |list|
      next unless list.css('li a').any?

      found = extract_groups_from_list(list, 'active')
      next unless found.any?

      log "Found #{found.size} groups using generic list selector", 1
      collected.concat(found)
    end
  rescue => e
    log "Error fetching IRTF groups: #{e.message}", 1
  end

  collected
end
83
+
84
# Build Group objects from the "Research Groups" dropdown menu.
#
# Only menu items whose href ends in "<word>.html" are used; the word,
# upper-cased, becomes the abbreviation. Hrefs that do not start with
# "http" are prefixed with the IRTF host. Items whose detail lookup raises
# are logged and skipped.
#
# @param doc [Nokogiri::HTML::Document] The HTML document
# @return [Array<Ietf::Data::Importer::Group>] Array of Group objects; empty
#   when no matching dropdown or menu is found
def extract_from_dropdown(doc)
  toggle = doc.css('a.dropdown-toggle').find { |node| node.text.include?('Research Groups') }
  return [] unless toggle

  menu = toggle.parent.css('.dropdown-menu')
  return [] unless menu.any?

  log "Found dropdown menu with research groups", 1

  menu.css('a.dropdown-item').each_with_object([]) do |link, groups|
    next unless link && link['href']

    name = link.text.strip
    href = link['href']

    # e.g. cfrg.html -> CFRG; skip items without such a file name
    slug = href[/(\w+)\.html$/, 1]
    next unless slug

    abbreviation = slug.upcase

    # Turn relative paths into full URLs
    details_url =
      if href.start_with?('http')
        href
      elsif href.start_with?('/')
        "https://www.irtf.org#{href}"
      else
        "https://www.irtf.org/#{href}"
      end

    begin
      details = fetch_group_details(details_url)

      groups << Importer::Group.new(
        abbreviation: abbreviation,
        name: name,
        organization: 'irtf',
        type: 'rg',
        area: nil,
        status: 'active', # Assume active since it's in the menu
        description: nil,
        chairs: details[:chairs],
        mailing_list: details[:mailing_list],
        mailing_list_archive: details[:mailing_list_archive],
        website_url: details_url,
        charter_url: details[:charter_url],
        concluded_date: details[:concluded_date]
      )
    rescue => e
      log "Error fetching details for #{abbreviation} (#{details_url}): #{e.message}", 2
    end
  end
end
155
+
156
+ private
157
+
158
# Build Group objects from the list that follows a section heading.
#
# The list is the first <ul> sibling after an <h3> whose text contains
# +section_title+. For each list item the abbreviation is taken from
# parentheses in the link text, or failing that from the last path segment
# of the href (upper-cased). Items with no link, no href or no abbreviation
# are skipped.
#
# Relative hrefs are resolved against https://www.irtf.org before the detail
# page is fetched, matching extract_groups_from_list and
# extract_from_dropdown. Previously a relative href was passed through
# unchanged, and URI.open treats a non-URL string as a local path.
#
# @param doc [Nokogiri::HTML::Document] The HTML document
# @param section_title [String] The title of the section to extract from
# @param status [String] The status of the groups in this section (active/concluded)
# @return [Array<Ietf::Data::Importer::Group>] Array of Group objects
def extract_groups(doc, section_title, status)
  groups = []
  section = doc.xpath("//h3[contains(text(), '#{section_title}')]/following-sibling::ul[1]")

  section.css('li').each do |group_item|
    link = group_item.at_css('a')
    next unless link && link['href']

    href = link['href']
    name = link.text.strip

    # Abbreviation: parenthesised text in the name, else last URL segment
    abbreviation = name[/\(([^)]+)\)/, 1]
    abbreviation ||= href[%r{/(\w+)/?$}, 1]&.upcase
    next unless abbreviation

    # Description is the item text with the link text and any
    # parenthesised abbreviation removed
    description = group_item.text.sub(link.text, '').strip
    description = description.sub(/\s*\([^)]+\)\s*/, ' ').strip

    # Ensure we have a full URL
    details_url = href
    unless details_url.start_with?('http')
      details_url = if details_url.start_with?('/')
                      "https://www.irtf.org#{details_url}"
                    else
                      "https://www.irtf.org/#{details_url}"
                    end
    end

    begin
      details = fetch_group_details(details_url)

      groups << Importer::Group.new(
        abbreviation: abbreviation,
        name: name.sub(/\s*\([^)]+\)\s*/, '').strip,
        organization: 'irtf',
        type: 'rg',
        area: nil,
        status: status,
        description: description,
        chairs: details[:chairs],
        mailing_list: details[:mailing_list],
        mailing_list_archive: details[:mailing_list_archive],
        website_url: details_url,
        charter_url: details[:charter_url],
        concluded_date: details[:concluded_date]
      )
    rescue => e
      log "Error fetching details for #{abbreviation}: #{e.message}", 2
    end
  end

  groups
end
221
+
222
# Build Group objects from the items of an arbitrary list element, without
# relying on a section heading.
#
# An item is used only when it has a link with an href and an abbreviation
# can be derived, either from parentheses in the link text or from the last
# path segment of the href (upper-cased). Hrefs that do not start with
# "http" are prefixed with the IRTF host. Items whose detail lookup raises
# are logged and skipped.
#
# @param list_element [Nokogiri::XML::Element] The list element to extract from
# @param status [String] The status of the groups in this list (active/concluded)
# @return [Array<Ietf::Data::Importer::Group>] Array of Group objects
def extract_groups_from_list(list_element, status)
  list_element.css('li').each_with_object([]) do |item, collected|
    anchor = item.at_css('a')
    href = anchor && anchor['href']
    next unless href

    label = anchor.text.strip

    # Prefer the parenthesised abbreviation; fall back to the URL segment
    in_parens = /\(([^)]+)\)/.match(label)
    abbreviation =
      if in_parens
        in_parens[1]
      else
        segment = %r{/(\w+)/?$}.match(href)
        segment && segment[1].upcase
      end
    next if abbreviation.nil? || abbreviation.empty?

    # Item text minus the link text and any parenthesised abbreviation
    description = item.text.sub(anchor.text, '').strip.sub(/\s*\([^)]+\)\s*/, ' ').strip

    # Ensure we have a full URL
    details_url =
      if href.start_with?('http')
        href
      elsif href.start_with?('/')
        "https://www.irtf.org#{href}"
      else
        "https://www.irtf.org/#{href}"
      end

    begin
      details = fetch_group_details(details_url)

      collected << Importer::Group.new(
        abbreviation: abbreviation,
        name: label.sub(/\s*\([^)]+\)\s*/, '').strip,
        organization: 'irtf',
        type: 'rg',
        area: nil,
        status: status,
        description: description,
        chairs: details[:chairs],
        mailing_list: details[:mailing_list],
        mailing_list_archive: details[:mailing_list_archive],
        website_url: details_url,
        charter_url: details[:charter_url],
        concluded_date: details[:concluded_date]
      )
    rescue => e
      log "Error fetching details for #{abbreviation} (#{details_url}): #{e.message}", 2
    end
  end
end
292
+
293
# Scrape the detail fields of one IRTF group from its page.
#
# Every field starts out empty and is filled only when the matching content
# is present; if the page cannot be fetched the empty defaults are returned.
#
# The chair lookup returns a node set, which is truthy even when nothing
# matched, so the extracted text itself is checked. Otherwise pages without
# a "Chair" heading would produce a chairs list containing one empty string.
#
# @param url [String] The URL of the group's page
# @return [Hash] Hash with :chairs, :mailing_list, :mailing_list_archive,
#   :charter_url and :concluded_date
def fetch_group_details(url)
  details = {
    chairs: [],
    mailing_list: nil,
    mailing_list_archive: nil,
    charter_url: nil,
    concluded_date: nil
  }

  doc = fetch_html(url)
  return details unless doc

  # Extract chairs: text of the first paragraph after a "Chair" heading
  chair_text = doc.xpath("//h3[contains(text(), 'Chair')]/following-sibling::p[1]").text.strip
  details[:chairs] << chair_text unless chair_text.empty?

  # Extract mailing list (first mailto: link on the page)
  mailing_list = doc.at_css('a[href^="mailto:"]')
  details[:mailing_list] = mailing_list['href'].sub('mailto:', '') if mailing_list

  # Extract mailing list archive
  archive = doc.at_css('a[href*="mailarchive.ietf.org"]')
  details[:mailing_list_archive] = archive['href'] if archive

  # Extract charter URL, resolved relative to the group's page
  charter_link = doc.at_css('a[href*="charter"]')
  details[:charter_url] = URI.join(url, charter_link['href']).to_s if charter_link

  # Extract concluded date, only for pages under a /concluded/ path
  if url.include?('/concluded/')
    concluded_info = doc.text.match(/concluded in\s+([A-Z][a-z]+\s+\d{4})/)
    if concluded_info
      begin
        details[:concluded_date] = Date.parse(concluded_info[1])
      rescue
        # Just leave it as nil if we can't parse it
      end
    end
  end

  details
end
346
+ end
347
+ end
348
+ end
349
+ end
350
+ end