ietf-data-importer 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/check_update.yml +7 -7
- data/.gitignore +4 -0
- data/.rubocop.yml +8 -1
- data/.rubocop_todo.yml +49 -0
- data/CLAUDE.md +73 -0
- data/Gemfile +1 -2
- data/README.adoc +32 -24
- data/exe/ietf-data-importer +1 -1
- data/ietf-data-importer.gemspec +3 -2
- data/lib/ietf/data/importer/cli.rb +14 -23
- data/lib/ietf/data/importer/group.rb +39 -4
- data/lib/ietf/data/importer/group_collection.rb +101 -1
- data/lib/ietf/data/importer/scrapers/base_scraper.rb +18 -9
- data/lib/ietf/data/importer/scrapers/ietf_scraper.rb +137 -213
- data/lib/ietf/data/importer/scrapers/irtf_scraper.rb +142 -291
- data/lib/ietf/data/importer/scrapers.rb +7 -35
- data/lib/ietf/data/importer/version.rb +1 -1
- data/lib/ietf/data/importer.rb +56 -66
- metadata +14 -11
|
@@ -1,347 +1,198 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require_relative "base_scraper"
|
|
4
|
-
require_relative "../group_collection"
|
|
5
4
|
|
|
6
5
|
module Ietf
|
|
7
6
|
module Data
|
|
8
7
|
module Importer
|
|
9
8
|
module Scrapers
|
|
10
|
-
# Scraper for IRTF groups from irtf.org
|
|
11
9
|
class IrtfScraper < BaseScraper
|
|
12
|
-
# Base URL for IRTF website
|
|
13
10
|
BASE_URL = "https://www.irtf.org/groups.html"
|
|
14
11
|
|
|
15
|
-
|
|
16
|
-
|
|
12
|
+
SECTION_TITLES = [
|
|
13
|
+
"Active Research Groups",
|
|
14
|
+
"Current Research Groups",
|
|
15
|
+
"Research Groups",
|
|
16
|
+
"IRTF Groups",
|
|
17
|
+
].freeze
|
|
18
|
+
|
|
17
19
|
def fetch
|
|
18
|
-
groups = []
|
|
19
20
|
log "Fetching IRTF groups..."
|
|
20
21
|
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
dropdown_groups = extract_from_dropdown(doc)
|
|
27
|
-
if dropdown_groups.any?
|
|
28
|
-
log "Found #{dropdown_groups.size} groups in dropdown menu", 1
|
|
29
|
-
groups.concat(dropdown_groups)
|
|
30
|
-
return groups
|
|
31
|
-
end
|
|
32
|
-
|
|
33
|
-
# If dropdown extraction fails, fall back to traditional section-based extraction
|
|
34
|
-
# Debug the page structure
|
|
35
|
-
headings = doc.css('h3').map(&:text).join(', ')
|
|
36
|
-
log "Found headings on IRTF page: #{headings}", 1
|
|
37
|
-
|
|
38
|
-
# Extract active groups
|
|
39
|
-
active_groups = extract_groups(doc, 'Active Research Groups', 'active')
|
|
40
|
-
log "Found #{active_groups.size} active IRTF groups", 1
|
|
41
|
-
|
|
42
|
-
# Extract concluded groups
|
|
43
|
-
concluded_groups = extract_groups(doc, 'Concluded Research Groups', 'concluded')
|
|
44
|
-
log "Found #{concluded_groups.size} concluded IRTF groups", 1
|
|
45
|
-
|
|
46
|
-
groups.concat(active_groups)
|
|
47
|
-
groups.concat(concluded_groups)
|
|
48
|
-
|
|
49
|
-
# If still no groups found, try alternative selectors
|
|
50
|
-
if groups.empty?
|
|
51
|
-
log "No groups found with standard selectors, trying alternatives...", 1
|
|
52
|
-
|
|
53
|
-
# Try different section titles
|
|
54
|
-
['Current Research Groups', 'Research Groups', 'IRTF Groups'].each do |title|
|
|
55
|
-
section_groups = extract_groups(doc, title, 'active')
|
|
56
|
-
if section_groups.any?
|
|
57
|
-
log "Found #{section_groups.size} groups with section title: #{title}", 1
|
|
58
|
-
groups.concat(section_groups)
|
|
59
|
-
end
|
|
60
|
-
end
|
|
61
|
-
|
|
62
|
-
# Try a more generic approach if still no groups
|
|
63
|
-
if groups.empty?
|
|
64
|
-
log "Using generic list item selector...", 1
|
|
65
|
-
# Find any unordered list with links
|
|
66
|
-
doc.css('ul').each do |list|
|
|
67
|
-
if list.css('li a').any?
|
|
68
|
-
generic_groups = extract_groups_from_list(list, 'active')
|
|
69
|
-
if generic_groups.any?
|
|
70
|
-
log "Found #{generic_groups.size} groups using generic list selector", 1
|
|
71
|
-
groups.concat(generic_groups)
|
|
72
|
-
end
|
|
73
|
-
end
|
|
74
|
-
end
|
|
75
|
-
end
|
|
76
|
-
end
|
|
77
|
-
rescue => e
|
|
78
|
-
log "Error fetching IRTF groups: #{e.message}", 1
|
|
79
|
-
end
|
|
22
|
+
doc = fetch_html(BASE_URL)
|
|
23
|
+
return build_collection([]) unless doc
|
|
24
|
+
|
|
25
|
+
groups = extract_from_dropdown(doc)
|
|
26
|
+
return build_collection(groups) if groups.any?
|
|
80
27
|
|
|
81
|
-
|
|
28
|
+
log "Dropdown extraction empty, falling back to section parsing", 1
|
|
29
|
+
build_collection(extract_from_sections(doc))
|
|
30
|
+
rescue StandardError => e
|
|
31
|
+
log "Error fetching IRTF groups: #{e.message}", 1
|
|
32
|
+
build_collection([])
|
|
82
33
|
end
|
|
83
34
|
|
|
84
|
-
|
|
85
|
-
# @param doc [Nokogiri::HTML::Document] The HTML document
|
|
86
|
-
# @return [Array<Ietf::Data::Importer::Group>] Array of Group objects
|
|
87
|
-
def extract_from_dropdown(doc)
|
|
88
|
-
groups = []
|
|
35
|
+
private
|
|
89
36
|
|
|
90
|
-
|
|
91
|
-
dropdown = doc.css(
|
|
92
|
-
el.text.include?(
|
|
37
|
+
def extract_from_dropdown(doc)
|
|
38
|
+
dropdown = doc.css("a.dropdown-toggle").find do |el|
|
|
39
|
+
el.text.include?("Research Groups")
|
|
93
40
|
end
|
|
94
|
-
|
|
95
41
|
return [] unless dropdown
|
|
96
42
|
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
dropdown_menu = dropdown_parent.css('.dropdown-menu')
|
|
100
|
-
return [] unless dropdown_menu.any?
|
|
43
|
+
menu = dropdown.parent.css(".dropdown-menu")
|
|
44
|
+
return [] unless menu.any?
|
|
101
45
|
|
|
102
46
|
log "Found dropdown menu with research groups", 1
|
|
103
47
|
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
next unless link && link['href']
|
|
48
|
+
menu.css("a.dropdown-item").filter_map do |link|
|
|
49
|
+
next unless link && link["href"]
|
|
107
50
|
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
# Extract abbreviation from href (e.g., cfrg.html -> CFRG)
|
|
112
|
-
if href =~ /(\w+)\.html$/
|
|
113
|
-
abbreviation = $1.upcase
|
|
114
|
-
else
|
|
115
|
-
next # Skip if we can't determine abbreviation
|
|
116
|
-
end
|
|
117
|
-
|
|
118
|
-
# Construct full URL if it's a relative path
|
|
119
|
-
details_url = href
|
|
120
|
-
if !details_url.start_with?('http')
|
|
121
|
-
if details_url.start_with?('/')
|
|
122
|
-
details_url = "https://www.irtf.org#{details_url}"
|
|
123
|
-
else
|
|
124
|
-
details_url = "https://www.irtf.org/#{details_url}"
|
|
125
|
-
end
|
|
126
|
-
end
|
|
127
|
-
|
|
128
|
-
begin
|
|
129
|
-
details = fetch_group_details(details_url)
|
|
130
|
-
|
|
131
|
-
group = Importer::Group.new(
|
|
132
|
-
abbreviation: abbreviation,
|
|
133
|
-
name: name,
|
|
134
|
-
organization: 'irtf',
|
|
135
|
-
type: 'rg',
|
|
136
|
-
area: nil,
|
|
137
|
-
status: 'active', # Assume active since it's in the menu
|
|
138
|
-
description: nil, # Will be populated from details page if available
|
|
139
|
-
chairs: details[:chairs],
|
|
140
|
-
mailing_list: details[:mailing_list],
|
|
141
|
-
mailing_list_archive: details[:mailing_list_archive],
|
|
142
|
-
website_url: details_url,
|
|
143
|
-
charter_url: details[:charter_url],
|
|
144
|
-
concluded_date: details[:concluded_date]
|
|
145
|
-
)
|
|
146
|
-
|
|
147
|
-
groups << group
|
|
148
|
-
rescue => e
|
|
149
|
-
log "Error fetching details for #{abbreviation} (#{details_url}): #{e.message}", 2
|
|
150
|
-
end
|
|
151
|
-
end
|
|
51
|
+
abbreviation = extract_abbreviation_from_href(link["href"])
|
|
52
|
+
next unless abbreviation
|
|
152
53
|
|
|
153
|
-
|
|
54
|
+
details_url = resolve_url(link["href"])
|
|
55
|
+
details = fetch_group_details(details_url)
|
|
56
|
+
|
|
57
|
+
build_group(
|
|
58
|
+
abbreviation: abbreviation,
|
|
59
|
+
name: link.text.strip,
|
|
60
|
+
organization: "irtf",
|
|
61
|
+
type: "rg",
|
|
62
|
+
status: "active",
|
|
63
|
+
website_url: details_url,
|
|
64
|
+
**details,
|
|
65
|
+
)
|
|
66
|
+
rescue StandardError => e
|
|
67
|
+
log "Error fetching details for #{abbreviation} (#{details_url}): #{e.message}",
|
|
68
|
+
2
|
|
69
|
+
nil
|
|
70
|
+
end
|
|
154
71
|
end
|
|
155
72
|
|
|
156
|
-
|
|
73
|
+
def extract_from_sections(doc)
|
|
74
|
+
log "Found headings: #{doc.css('h3').map(&:text).join(', ')}", 1
|
|
157
75
|
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
# @param status [String] The status of the groups in this section (active/concluded)
|
|
162
|
-
# @return [Array<Ietf::Data::Importer::Group>] Array of Group objects
|
|
163
|
-
def extract_groups(doc, section_title, status)
|
|
164
|
-
groups = []
|
|
165
|
-
section = doc.xpath("//h3[contains(text(), '#{section_title}')]/following-sibling::ul[1]")
|
|
76
|
+
active = extract_from_section(doc, "Active Research Groups",
|
|
77
|
+
"active")
|
|
78
|
+
log "Found #{active.size} active IRTF groups", 1
|
|
166
79
|
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
80
|
+
concluded = extract_from_section(doc, "Concluded Research Groups",
|
|
81
|
+
"concluded")
|
|
82
|
+
log "Found #{concluded.size} concluded IRTF groups", 1
|
|
170
83
|
|
|
171
|
-
|
|
172
|
-
|
|
84
|
+
groups = active + concluded
|
|
85
|
+
return groups if groups.any?
|
|
173
86
|
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
# If unable to extract abbreviation, try from the URL
|
|
180
|
-
if abbreviation.nil? && link['href'] =~ %r{/(\w+)/?$}
|
|
181
|
-
abbreviation = $1.upcase
|
|
182
|
-
end
|
|
87
|
+
log "No groups found with standard selectors, trying alternatives...",
|
|
88
|
+
1
|
|
89
|
+
extract_from_fallback_sections(doc)
|
|
90
|
+
end
|
|
183
91
|
|
|
184
|
-
|
|
92
|
+
def extract_from_section(doc, title, status)
|
|
93
|
+
section = doc.xpath("//h3[contains(text(), '#{title}')]/following-sibling::ul[1]")
|
|
94
|
+
extract_groups_from_list(section, status)
|
|
95
|
+
end
|
|
185
96
|
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
description = description.sub(/\s*\([^)]+\)\s*/, ' ').strip
|
|
191
|
-
|
|
192
|
-
# Get details from the group's page
|
|
193
|
-
details_url = link['href']
|
|
194
|
-
begin
|
|
195
|
-
details = fetch_group_details(details_url)
|
|
196
|
-
|
|
197
|
-
group = Importer::Group.new(
|
|
198
|
-
abbreviation: abbreviation,
|
|
199
|
-
name: name.sub(/\s*\([^)]+\)\s*/, '').strip,
|
|
200
|
-
organization: 'irtf',
|
|
201
|
-
type: 'rg',
|
|
202
|
-
area: nil,
|
|
203
|
-
status: status,
|
|
204
|
-
description: description,
|
|
205
|
-
chairs: details[:chairs],
|
|
206
|
-
mailing_list: details[:mailing_list],
|
|
207
|
-
mailing_list_archive: details[:mailing_list_archive],
|
|
208
|
-
website_url: details_url,
|
|
209
|
-
charter_url: details[:charter_url],
|
|
210
|
-
concluded_date: details[:concluded_date]
|
|
211
|
-
)
|
|
212
|
-
|
|
213
|
-
groups << group
|
|
214
|
-
rescue => e
|
|
215
|
-
log "Error fetching details for #{abbreviation}: #{e.message}", 2
|
|
216
|
-
end
|
|
97
|
+
def extract_from_fallback_sections(doc)
|
|
98
|
+
SECTION_TITLES.each do |title|
|
|
99
|
+
groups = extract_from_section(doc, title, "active")
|
|
100
|
+
return groups if groups.any?
|
|
217
101
|
end
|
|
218
102
|
|
|
219
|
-
|
|
103
|
+
doc.css("ul").flat_map do |list|
|
|
104
|
+
next [] unless list.css("li a").any?
|
|
105
|
+
|
|
106
|
+
extract_groups_from_list(list, "active")
|
|
107
|
+
end
|
|
220
108
|
end
|
|
221
109
|
|
|
222
|
-
# Helper method to extract groups from any list without requiring a specific section heading
|
|
223
|
-
# @param list_element [Nokogiri::XML::Element] The list element to extract from
|
|
224
|
-
# @param status [String] The status of the groups in this list (active/concluded)
|
|
225
|
-
# @return [Array<Ietf::Data::Importer::Group>] Array of Group objects
|
|
226
110
|
def extract_groups_from_list(list_element, status)
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
link = group_item.at_css('a')
|
|
231
|
-
next unless link && link['href']
|
|
111
|
+
list_element.css("li").filter_map do |item|
|
|
112
|
+
link = item.at_css("a")
|
|
113
|
+
next unless link && link["href"]
|
|
232
114
|
|
|
233
115
|
name = link.text.strip
|
|
234
|
-
abbreviation =
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
details_url = link['href']
|
|
256
|
-
# Ensure we have a full URL
|
|
257
|
-
if !details_url.start_with?('http')
|
|
258
|
-
if details_url.start_with?('/')
|
|
259
|
-
details_url = "https://www.irtf.org#{details_url}"
|
|
260
|
-
else
|
|
261
|
-
details_url = "https://www.irtf.org/#{details_url}"
|
|
262
|
-
end
|
|
263
|
-
end
|
|
264
|
-
|
|
265
|
-
begin
|
|
266
|
-
details = fetch_group_details(details_url)
|
|
267
|
-
|
|
268
|
-
group = Importer::Group.new(
|
|
269
|
-
abbreviation: abbreviation,
|
|
270
|
-
name: name.sub(/\s*\([^)]+\)\s*/, '').strip,
|
|
271
|
-
organization: 'irtf',
|
|
272
|
-
type: 'rg',
|
|
273
|
-
area: nil,
|
|
274
|
-
status: status,
|
|
275
|
-
description: description,
|
|
276
|
-
chairs: details[:chairs],
|
|
277
|
-
mailing_list: details[:mailing_list],
|
|
278
|
-
mailing_list_archive: details[:mailing_list_archive],
|
|
279
|
-
website_url: details_url,
|
|
280
|
-
charter_url: details[:charter_url],
|
|
281
|
-
concluded_date: details[:concluded_date]
|
|
282
|
-
)
|
|
283
|
-
|
|
284
|
-
groups << group
|
|
285
|
-
rescue => e
|
|
286
|
-
log "Error fetching details for #{abbreviation} (#{details_url}): #{e.message}", 2
|
|
287
|
-
end
|
|
116
|
+
abbreviation = extract_abbreviation(name, link["href"])
|
|
117
|
+
next unless abbreviation
|
|
118
|
+
|
|
119
|
+
description = extract_description(item, link)
|
|
120
|
+
details_url = resolve_url(link["href"])
|
|
121
|
+
details = fetch_group_details(details_url)
|
|
122
|
+
|
|
123
|
+
build_group(
|
|
124
|
+
abbreviation: abbreviation,
|
|
125
|
+
name: name.sub(/\s*\([^)]+\)\s*/, "").strip,
|
|
126
|
+
organization: "irtf",
|
|
127
|
+
type: "rg",
|
|
128
|
+
status: status,
|
|
129
|
+
description: description,
|
|
130
|
+
website_url: details_url,
|
|
131
|
+
**details,
|
|
132
|
+
)
|
|
133
|
+
rescue StandardError => e
|
|
134
|
+
log "Error fetching details for #{abbreviation} (#{details_url}): #{e.message}",
|
|
135
|
+
2
|
|
136
|
+
nil
|
|
288
137
|
end
|
|
138
|
+
end
|
|
289
139
|
|
|
290
|
-
|
|
140
|
+
def extract_abbreviation(name, href)
|
|
141
|
+
if name =~ /\(([^)]+)\)/
|
|
142
|
+
$1
|
|
143
|
+
elsif href =~ %r{/(\w+)/?$}
|
|
144
|
+
$1.upcase
|
|
145
|
+
end
|
|
291
146
|
end
|
|
292
147
|
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
def fetch_group_details(url)
|
|
297
|
-
details = {
|
|
298
|
-
chairs: [],
|
|
299
|
-
mailing_list: nil,
|
|
300
|
-
mailing_list_archive: nil,
|
|
301
|
-
charter_url: nil,
|
|
302
|
-
concluded_date: nil
|
|
303
|
-
}
|
|
148
|
+
def extract_abbreviation_from_href(href)
|
|
149
|
+
$1.upcase if href =~ /(\w+)\.html$/
|
|
150
|
+
end
|
|
304
151
|
|
|
305
|
-
|
|
306
|
-
|
|
152
|
+
def extract_description(item, link)
|
|
153
|
+
item.text.sub(link.text, "").sub(/\s*\([^)]+\)\s*/, " ").strip
|
|
154
|
+
end
|
|
307
155
|
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
156
|
+
def resolve_url(href)
|
|
157
|
+
case href
|
|
158
|
+
when %r{\Ahttps?://} then href
|
|
159
|
+
when %r{\A/} then "https://www.irtf.org#{href}"
|
|
160
|
+
else "https://www.irtf.org/#{href}"
|
|
312
161
|
end
|
|
162
|
+
end
|
|
313
163
|
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
164
|
+
def fetch_group_details(url)
|
|
165
|
+
doc = fetch_html(url)
|
|
166
|
+
return {} unless doc
|
|
167
|
+
|
|
168
|
+
{
|
|
169
|
+
chairs: extract_chairs(doc),
|
|
170
|
+
mailing_list: doc.at_css('a[href^="mailto:"]')&.[]("href")&.sub(
|
|
171
|
+
"mailto:", ""
|
|
172
|
+
),
|
|
173
|
+
mailing_list_archive: doc.at_css('a[href*="mailarchive.ietf.org"]')&.[]("href"),
|
|
174
|
+
charter_url: extract_charter_url(doc, url),
|
|
175
|
+
concluded_date: extract_concluded_date(doc, url),
|
|
176
|
+
}
|
|
177
|
+
end
|
|
319
178
|
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
end
|
|
179
|
+
def extract_chairs(doc)
|
|
180
|
+
chair = doc.xpath("//h3[contains(text(), 'Chair')]/following-sibling::p[1]")
|
|
181
|
+
chair ? [chair.text.strip] : []
|
|
182
|
+
end
|
|
325
183
|
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
if
|
|
329
|
-
|
|
330
|
-
end
|
|
184
|
+
def extract_charter_url(doc, base_url)
|
|
185
|
+
link = doc.at_css('a[href*="charter"]')
|
|
186
|
+
URI.join(base_url, link["href"]).to_s if link
|
|
187
|
+
end
|
|
331
188
|
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
concluded_info = doc.text.match(/concluded in\s+([A-Z][a-z]+\s+\d{4})/)
|
|
335
|
-
if concluded_info
|
|
336
|
-
begin
|
|
337
|
-
details[:concluded_date] = Date.parse(concluded_info[1])
|
|
338
|
-
rescue
|
|
339
|
-
# Just leave it as nil if we can't parse it
|
|
340
|
-
end
|
|
341
|
-
end
|
|
342
|
-
end
|
|
189
|
+
def extract_concluded_date(doc, url)
|
|
190
|
+
return nil unless url.include?("/concluded/")
|
|
343
191
|
|
|
344
|
-
|
|
192
|
+
match = doc.text.match(/concluded in\s+([A-Z][a-z]+\s+\d{4})/)
|
|
193
|
+
Date.parse(match[1]) if match
|
|
194
|
+
rescue Date::Error
|
|
195
|
+
nil
|
|
345
196
|
end
|
|
346
197
|
end
|
|
347
198
|
end
|
|
@@ -3,61 +3,33 @@
|
|
|
3
3
|
require_relative "scrapers/base_scraper"
|
|
4
4
|
require_relative "scrapers/ietf_scraper"
|
|
5
5
|
require_relative "scrapers/irtf_scraper"
|
|
6
|
-
require_relative "group_collection"
|
|
7
6
|
|
|
8
7
|
module Ietf
|
|
9
8
|
module Data
|
|
10
9
|
module Importer
|
|
11
|
-
# Module for IETF/IRTF web scrapers
|
|
12
10
|
module Scrapers
|
|
13
|
-
# Fetch all IETF and IRTF groups
|
|
14
|
-
# @return [Ietf::Data::Importer::GroupCollection] Collection of all groups
|
|
15
11
|
def self.fetch_all
|
|
16
12
|
puts "Starting to fetch IETF and IRTF group data..."
|
|
17
13
|
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
puts "Fetched #{ietf_groups.size} IETF groups"
|
|
14
|
+
ietf = fetch_ietf
|
|
15
|
+
puts "Fetched #{ietf.size} IETF groups"
|
|
21
16
|
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
puts "Fetched #{irtf_groups.size} IRTF groups"
|
|
17
|
+
irtf = fetch_irtf
|
|
18
|
+
puts "Fetched #{irtf.size} IRTF groups"
|
|
25
19
|
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
puts "Total: #{all_groups.size} groups"
|
|
20
|
+
merged = ietf.merge(irtf)
|
|
21
|
+
puts "Total: #{merged.size} groups"
|
|
29
22
|
|
|
30
|
-
|
|
23
|
+
merged
|
|
31
24
|
end
|
|
32
25
|
|
|
33
|
-
# Fetch IETF groups only
|
|
34
|
-
# @return [Array<Ietf::Data::Importer::Group>] Array of IETF groups
|
|
35
26
|
def self.fetch_ietf
|
|
36
27
|
IetfScraper.new.fetch
|
|
37
28
|
end
|
|
38
29
|
|
|
39
|
-
# Fetch IRTF groups only
|
|
40
|
-
# @return [Array<Ietf::Data::Importer::Group>] Array of IRTF groups
|
|
41
30
|
def self.fetch_irtf
|
|
42
31
|
IrtfScraper.new.fetch
|
|
43
32
|
end
|
|
44
|
-
|
|
45
|
-
# Save group collection to a file
|
|
46
|
-
# @param collection [Ietf::Data::Importer::GroupCollection] Group collection to save
|
|
47
|
-
# @param file_path [String] Path to the output file
|
|
48
|
-
# @param format [Symbol] Output format (:yaml or :json)
|
|
49
|
-
def self.save_to_file(collection, file_path, format = :yaml)
|
|
50
|
-
case format.to_sym
|
|
51
|
-
when :yaml
|
|
52
|
-
File.write(file_path, collection.to_yaml)
|
|
53
|
-
when :json
|
|
54
|
-
File.write(file_path, collection.to_json)
|
|
55
|
-
else
|
|
56
|
-
raise ArgumentError, "Unsupported format: #{format}"
|
|
57
|
-
end
|
|
58
|
-
|
|
59
|
-
puts "Saved #{collection.groups.size} groups to #{file_path}"
|
|
60
|
-
end
|
|
61
33
|
end
|
|
62
34
|
end
|
|
63
35
|
end
|