ietf-data-importer 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,269 +1,193 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative "base_scraper"
4
- require_relative "../group_collection"
5
4
 
6
5
  module Ietf
7
6
  module Data
8
7
  module Importer
9
8
  module Scrapers
10
- # Scraper for IETF groups from datatracker.ietf.org
11
9
  class IetfScraper < BaseScraper
12
- # Base URL for IETF datatracker
13
10
  BASE_URL = "https://datatracker.ietf.org/group/"
14
11
 
15
- # Fetch all IETF groups
16
- # @return [Array<Ietf::Data::Importer::Group>] Array of Group objects
12
+ STANDARD_TYPES = [
13
+ { name: "Working Group", abbreviation: "wg", url: "/wg/" },
14
+ { name: "Research Group", abbreviation: "rg", url: "/rg/" },
15
+ { name: "Area", abbreviation: "area", url: "/area/" },
16
+ { name: "Team", abbreviation: "team", url: "/team/" },
17
+ { name: "Program", abbreviation: "program", url: "/program/" },
18
+ { name: "Directorate", abbreviation: "dir", url: "/dir/" },
19
+ { name: "Advisory Group", abbreviation: "ag", url: "/ag/" },
20
+ { name: "BOF", abbreviation: "bof", url: "/bof/" },
21
+ ].freeze
22
+
23
+ TABLE_SELECTORS = [
24
+ ".group-list tbody tr",
25
+ "table.table-sm tbody tr",
26
+ "table.tablesorter tbody tr",
27
+ ].freeze
28
+
17
29
  def fetch
18
- groups = []
19
30
  log "Fetching IETF groups..."
20
31
 
21
- # Fetch all group types
22
32
  group_types = fetch_group_types
23
33
 
24
- # For each group type, fetch its groups
25
- group_types.each do |type|
34
+ groups = group_types.flat_map do |type|
26
35
  log "Fetching #{type[:name]} groups...", 1
36
+ next [] if type[:url].nil? || type[:url].empty?
27
37
 
28
- # Skip if URL is empty
29
- next if type[:url].nil? || type[:url].empty?
30
-
31
- # Construct the full URL
32
- type_url = if type[:url].start_with?('/')
33
- "https://datatracker.ietf.org#{type[:url]}"
34
- else
35
- "https://datatracker.ietf.org/#{type[:url]}"
36
- end
38
+ type_url = resolve_url(type[:url])
37
39
  type_doc = fetch_html(type_url)
38
- next unless type_doc
40
+ next [] unless type_doc
39
41
 
40
- # Extract groups from the table
41
- extract_groups_from_table(type_doc, type, groups)
42
+ extract_groups_from_table(type_doc, type)
42
43
  end
43
44
 
44
- groups
45
+ build_collection(groups)
45
46
  end
46
47
 
47
48
  private
48
49
 
49
- # Fetch all group types from the main IETF groups page
50
- # @return [Array<Hash>] Array of group type information
50
+ def resolve_url(path)
51
+ if path.start_with?("/")
52
+ "https://datatracker.ietf.org#{path}"
53
+ else
54
+ "https://datatracker.ietf.org/#{path}"
55
+ end
56
+ end
57
+
51
58
  def fetch_group_types
52
59
  doc = fetch_html(BASE_URL)
53
- return [] unless doc
60
+ return STANDARD_TYPES unless doc
54
61
 
55
62
  log "Looking for group types on the page...", 1
56
63
 
57
- # Extract group types from the table on the main page
58
- group_types = []
59
-
60
- # Try to find from the table first
61
- doc.css('table.tablesorter tbody tr').each do |row|
62
- type_cell = row.at_css('td a')
63
- next unless type_cell && type_cell['href']
64
+ discovered = discover_group_types(doc)
65
+ if discovered.empty?
66
+ log "Using predefined group types...", 1
67
+ STANDARD_TYPES
68
+ else
69
+ log "Found #{discovered.size} group types: #{discovered.map do |t|
70
+ t[:abbreviation]
71
+ end.join(', ')}", 1
72
+ discovered
73
+ end
74
+ end
64
75
 
65
- href = type_cell['href']
66
- next unless href.include?('/')
76
+ def discover_group_types(doc)
77
+ doc.css("table.tablesorter tbody tr").filter_map do |row|
78
+ type_cell = row.at_css("td a")
79
+ next unless type_cell && type_cell["href"]
67
80
 
68
- type_abbr = href.sub(/\/$/, '').split('/').last
69
- name = type_cell.text.strip
81
+ href = type_cell["href"]
82
+ next unless href.include?("/")
70
83
 
71
- group_types << {
72
- name: name,
73
- abbreviation: type_abbr.downcase,
74
- url: href
84
+ {
85
+ name: type_cell.text.strip,
86
+ abbreviation: href.sub(%r{/$}, "").split("/").last.downcase,
87
+ url: href,
75
88
  }
76
89
  end
77
-
78
- # If we didn't find any types in the table, use the predefined list
79
- if group_types.empty?
80
- log "Using predefined group types...", 1
81
- standard_types = [
82
- { name: "Working Group", abbreviation: "wg", url: "/wg/" },
83
- { name: "Research Group", abbreviation: "rg", url: "/rg/" },
84
- { name: "Area", abbreviation: "area", url: "/area/" },
85
- { name: "Team", abbreviation: "team", url: "/team/" },
86
- { name: "Program", abbreviation: "program", url: "/program/" },
87
- { name: "Directorate", abbreviation: "dir", url: "/dir/" },
88
- { name: "Advisory Group", abbreviation: "ag", url: "/ag/" },
89
- { name: "BOF", abbreviation: "bof", url: "/bof/" }
90
- ]
91
- group_types = standard_types
92
- end
93
-
94
- log "Found #{group_types.size} group types: #{group_types.map { |t| t[:abbreviation] }.join(', ')}", 1
95
- group_types
96
90
  end
97
91
 
98
- # Extract groups from a table on the group type page
99
- # @param doc [Nokogiri::HTML::Document] The HTML document
100
- # @param type [Hash] The group type information
101
- # @param groups [Array<Ietf::Data::Importer::Group>] Array to add groups to
102
- def extract_groups_from_table(doc, type, groups)
103
- # Try different table selectors
104
- selectors = [
105
- '.group-list tbody tr', # Traditional format
106
- 'table.table-sm tbody tr', # New table format
107
- 'table.tablesorter tbody tr' # Another possible format
108
- ]
109
-
110
- rows = []
111
- selectors.each do |selector|
112
- found_rows = doc.css(selector)
113
- if found_rows.any?
114
- log "Found #{found_rows.size} groups using selector: #{selector}", 2
115
- rows = found_rows
116
- break
117
- end
118
- end
119
-
120
- rows.each do |row|
121
- # Try different selectors for finding the abbreviation and name
122
- abbreviation = nil
123
- name = nil
124
-
125
- # First, try to find the abbreviation and name using standard classes
126
- abbreviation ||= row.at_css('.acronym')&.text&.strip
127
- name ||= row.at_css('.name')&.text&.strip
128
-
129
- # If that doesn't work, try to find by column position
130
- if abbreviation.nil? || name.nil?
131
- # First column might be the abbreviation, second might be the name
132
- cells = row.css('td')
133
- if cells.size >= 2
134
- abbreviation ||= cells[0].text.strip
135
- name ||= cells[1].text.strip
136
- end
137
- end
138
-
139
- # If we still don't have them, try to extract from links
140
- if abbreviation.nil? || name.nil?
141
- link = row.at_css('a')
142
- if link
143
- # Try to extract abbreviation from the URL
144
- if link['href'] =~ %r{/([^/]+)/?$}
145
- abbreviation ||= $1.upcase
146
- end
147
-
148
- # Use link text as the name
149
- name ||= link.text.strip
150
- end
151
- end
92
+ def extract_groups_from_table(doc, type)
93
+ rows = TABLE_SELECTORS.filter_map do |selector|
94
+ found = doc.css(selector)
95
+ found.any? ? found : nil
96
+ end.first || []
152
97
 
153
- # Skip if we still couldn't extract basic info
154
- next unless abbreviation && name && !abbreviation.empty? && !name.empty?
155
-
156
- # Extract other fields from the row
157
- status = 'active' # Default to active
158
-
159
- # Try to find status from row classes or content
160
- status = 'concluded' if row['class'] && row['class'].include?('concluded')
161
- status = 'concluded' if row.text.include?('Concluded')
162
- status = 'active' if row.at_css('.active') || row.text.include?('Active')
163
-
164
- # Try to find the area
165
- area = nil
166
- area_element = row.at_css('.area')
167
- area = area_element.text.strip if area_element
168
-
169
- # Get the group detail page URL
170
- detail_link = row.at_css('a')
171
- next unless detail_link
172
-
173
- group_url = detail_link['href']
174
- detail_url = URI.join(BASE_URL, group_url)
175
-
176
- # Fetch additional details from the group's page
177
- begin
178
- details = fetch_group_details(detail_url)
179
-
180
- # Create Group object
181
- group = Importer::Group.new(
182
- abbreviation: abbreviation,
183
- name: name,
184
- organization: 'ietf',
185
- type: type[:abbreviation],
186
- area: area,
187
- status: status,
188
- description: details[:description],
189
- chairs: details[:chairs],
190
- mailing_list: details[:mailing_list],
191
- mailing_list_archive: details[:mailing_list_archive],
192
- website_url: details[:website_url],
193
- charter_url: details[:charter_url],
194
- concluded_date: details[:concluded_date]
195
- )
196
-
197
- groups << group
198
- rescue => e
199
- log "Error fetching details for #{abbreviation}: #{e.message}", 2
200
- end
98
+ rows.filter_map do |row|
99
+ extract_group_from_row(row, type)
201
100
  end
202
101
  end
203
102
 
204
- # Fetch details for a specific group from its page
205
- # @param url [String] The URL of the group's page
206
- # @return [Hash] Hash of group details
207
- def fetch_group_details(url)
208
- details = {
209
- description: nil,
210
- chairs: [],
211
- mailing_list: nil,
212
- mailing_list_archive: nil,
213
- website_url: nil,
214
- charter_url: nil,
215
- concluded_date: nil
216
- }
103
+ def extract_group_from_row(row, type)
104
+ basic = extract_basic_info(row)
105
+ return nil unless basic[:abbreviation] && basic[:name]
106
+
107
+ status = determine_status(row)
108
+ area = row.at_css(".area")&.text&.strip
109
+
110
+ detail_link = row.at_css("a")
111
+ return nil unless detail_link
112
+
113
+ detail_url = URI.join(BASE_URL, detail_link["href"])
114
+ details = fetch_group_details(detail_url)
115
+
116
+ build_group(
117
+ abbreviation: basic[:abbreviation],
118
+ name: basic[:name],
119
+ organization: "ietf",
120
+ type: type[:abbreviation],
121
+ area: area,
122
+ status: status,
123
+ **details,
124
+ )
125
+ rescue StandardError => e
126
+ log "Error fetching details for #{basic&.dig(:abbreviation)}: #{e.message}",
127
+ 2
128
+ nil
129
+ end
217
130
 
218
- doc = fetch_html(url)
219
- return details unless doc
131
+ def extract_basic_info(row)
132
+ abbreviation = row.at_css(".acronym")&.text&.strip
133
+ name = row.at_css(".name")&.text&.strip
220
134
 
221
- # Extract description from charter
222
- charter_section = doc.at_css('#charter')
223
- if charter_section
224
- details[:description] = charter_section.text.strip
135
+ if abbreviation.nil? || name.nil?
136
+ cells = row.css("td")
137
+ if cells.size >= 2
138
+ abbreviation ||= cells[0].text.strip
139
+ name ||= cells[1].text.strip
140
+ end
225
141
  end
226
142
 
227
- # Extract chairs
228
- doc.css('.role-WG-chair, .role-RG-chair').each do |chair|
229
- details[:chairs] << chair.text.strip
143
+ if abbreviation.nil? || name.nil?
144
+ link = row.at_css("a")
145
+ if link
146
+ abbreviation ||= $1.upcase if link["href"] =~ %r{/([^/]+)/?$}
147
+ name ||= link.text.strip
148
+ end
230
149
  end
231
150
 
232
- # Extract mailing list
233
- mailing_list = doc.at_css('a[href^="mailto:"]')
234
- if mailing_list
235
- details[:mailing_list] = mailing_list['href'].sub('mailto:', '')
236
- end
151
+ { abbreviation: abbreviation, name: name }
152
+ end
237
153
 
238
- # Extract mailing list archive
239
- archive = doc.at_css('a[href*="mailarchive.ietf.org"]')
240
- if archive
241
- details[:mailing_list_archive] = archive['href']
242
- end
154
+ def determine_status(row)
155
+ return "concluded" if row["class"]&.include?("concluded")
156
+ return "concluded" if row.text.include?("Concluded")
157
+ return "active" if row.at_css(".active") || row.text.include?("Active")
243
158
 
244
- # Extract website if available
245
- website = doc.at_css('.additional-urls a')
246
- if website
247
- details[:website_url] = website['href']
248
- end
159
+ "active"
160
+ end
249
161
 
250
- # Extract charter URL
251
- charter_link = doc.at_css('a[href*="/charter/"]')
252
- if charter_link
253
- details[:charter_url] = URI.join("https://datatracker.ietf.org", charter_link['href']).to_s
254
- end
162
+ def fetch_group_details(url)
163
+ doc = fetch_html(url)
164
+ return {} unless doc
165
+
166
+ {
167
+ description: doc.at_css("#charter")&.text&.strip,
168
+ chairs: doc.css(".role-WG-chair, .role-RG-chair").map do |c|
169
+ c.text.strip
170
+ end,
171
+ mailing_list: doc.at_css('a[href^="mailto:"]')&.[]("href")&.sub(
172
+ "mailto:", ""
173
+ ),
174
+ mailing_list_archive: doc.at_css('a[href*="mailarchive.ietf.org"]')&.[]("href"),
175
+ website_url: doc.at_css(".additional-urls a")&.[]("href"),
176
+ charter_url: extract_charter_url(doc),
177
+ concluded_date: extract_concluded_date(doc),
178
+ }
179
+ end
255
180
 
256
- # Extract concluded date
257
- concluded_info = doc.text.match(/Concluded\s+([A-Z][a-z]+\s+\d{4})/)
258
- if concluded_info
259
- begin
260
- details[:concluded_date] = Date.parse(concluded_info[1])
261
- rescue
262
- # Just leave it as nil if we can't parse it
263
- end
264
- end
181
+ def extract_charter_url(doc)
182
+ link = doc.at_css('a[href*="/charter/"]')
183
+ URI.join("https://datatracker.ietf.org", link["href"]).to_s if link
184
+ end
265
185
 
266
- details
186
+ def extract_concluded_date(doc)
187
+ match = doc.text.match(/Concluded\s+([A-Z][a-z]+\s+\d{4})/)
188
+ Date.parse(match[1]) if match
189
+ rescue Date::Error
190
+ nil
267
191
  end
268
192
  end
269
193
  end