ietf-data-importer 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/check_update.yml +7 -7
- data/.gitignore +4 -0
- data/.rubocop.yml +8 -1
- data/.rubocop_todo.yml +49 -0
- data/CLAUDE.md +73 -0
- data/Gemfile +1 -2
- data/README.adoc +32 -24
- data/exe/ietf-data-importer +1 -1
- data/ietf-data-importer.gemspec +3 -2
- data/lib/ietf/data/importer/cli.rb +14 -23
- data/lib/ietf/data/importer/group.rb +39 -4
- data/lib/ietf/data/importer/group_collection.rb +101 -1
- data/lib/ietf/data/importer/scrapers/base_scraper.rb +18 -9
- data/lib/ietf/data/importer/scrapers/ietf_scraper.rb +137 -213
- data/lib/ietf/data/importer/scrapers/irtf_scraper.rb +142 -291
- data/lib/ietf/data/importer/scrapers.rb +7 -35
- data/lib/ietf/data/importer/version.rb +1 -1
- data/lib/ietf/data/importer.rb +56 -66
- metadata +14 -11
|
@@ -1,269 +1,193 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require_relative "base_scraper"
|
|
4
|
-
require_relative "../group_collection"
|
|
5
4
|
|
|
6
5
|
module Ietf
|
|
7
6
|
module Data
|
|
8
7
|
module Importer
|
|
9
8
|
module Scrapers
|
|
10
|
-
# Scraper for IETF groups from datatracker.ietf.org
|
|
11
9
|
class IetfScraper < BaseScraper
|
|
12
|
-
# Base URL for IETF datatracker
|
|
13
10
|
BASE_URL = "https://datatracker.ietf.org/group/"
|
|
14
11
|
|
|
15
|
-
|
|
16
|
-
|
|
12
|
+
STANDARD_TYPES = [
|
|
13
|
+
{ name: "Working Group", abbreviation: "wg", url: "/wg/" },
|
|
14
|
+
{ name: "Research Group", abbreviation: "rg", url: "/rg/" },
|
|
15
|
+
{ name: "Area", abbreviation: "area", url: "/area/" },
|
|
16
|
+
{ name: "Team", abbreviation: "team", url: "/team/" },
|
|
17
|
+
{ name: "Program", abbreviation: "program", url: "/program/" },
|
|
18
|
+
{ name: "Directorate", abbreviation: "dir", url: "/dir/" },
|
|
19
|
+
{ name: "Advisory Group", abbreviation: "ag", url: "/ag/" },
|
|
20
|
+
{ name: "BOF", abbreviation: "bof", url: "/bof/" },
|
|
21
|
+
].freeze
|
|
22
|
+
|
|
23
|
+
TABLE_SELECTORS = [
|
|
24
|
+
".group-list tbody tr",
|
|
25
|
+
"table.table-sm tbody tr",
|
|
26
|
+
"table.tablesorter tbody tr",
|
|
27
|
+
].freeze
|
|
28
|
+
|
|
17
29
|
def fetch
|
|
18
|
-
groups = []
|
|
19
30
|
log "Fetching IETF groups..."
|
|
20
31
|
|
|
21
|
-
# Fetch all group types
|
|
22
32
|
group_types = fetch_group_types
|
|
23
33
|
|
|
24
|
-
|
|
25
|
-
group_types.each do |type|
|
|
34
|
+
groups = group_types.flat_map do |type|
|
|
26
35
|
log "Fetching #{type[:name]} groups...", 1
|
|
36
|
+
next [] if type[:url].nil? || type[:url].empty?
|
|
27
37
|
|
|
28
|
-
|
|
29
|
-
next if type[:url].nil? || type[:url].empty?
|
|
30
|
-
|
|
31
|
-
# Construct the full URL
|
|
32
|
-
type_url = if type[:url].start_with?('/')
|
|
33
|
-
"https://datatracker.ietf.org#{type[:url]}"
|
|
34
|
-
else
|
|
35
|
-
"https://datatracker.ietf.org/#{type[:url]}"
|
|
36
|
-
end
|
|
38
|
+
type_url = resolve_url(type[:url])
|
|
37
39
|
type_doc = fetch_html(type_url)
|
|
38
|
-
next unless type_doc
|
|
40
|
+
next [] unless type_doc
|
|
39
41
|
|
|
40
|
-
|
|
41
|
-
extract_groups_from_table(type_doc, type, groups)
|
|
42
|
+
extract_groups_from_table(type_doc, type)
|
|
42
43
|
end
|
|
43
44
|
|
|
44
|
-
groups
|
|
45
|
+
build_collection(groups)
|
|
45
46
|
end
|
|
46
47
|
|
|
47
48
|
private
|
|
48
49
|
|
|
49
|
-
|
|
50
|
-
|
|
50
|
+
def resolve_url(path)
|
|
51
|
+
if path.start_with?("/")
|
|
52
|
+
"https://datatracker.ietf.org#{path}"
|
|
53
|
+
else
|
|
54
|
+
"https://datatracker.ietf.org/#{path}"
|
|
55
|
+
end
|
|
56
|
+
end
|
|
57
|
+
|
|
51
58
|
def fetch_group_types
|
|
52
59
|
doc = fetch_html(BASE_URL)
|
|
53
|
-
return
|
|
60
|
+
return STANDARD_TYPES unless doc
|
|
54
61
|
|
|
55
62
|
log "Looking for group types on the page...", 1
|
|
56
63
|
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
+
discovered = discover_group_types(doc)
|
|
65
|
+
if discovered.empty?
|
|
66
|
+
log "Using predefined group types...", 1
|
|
67
|
+
STANDARD_TYPES
|
|
68
|
+
else
|
|
69
|
+
log "Found #{discovered.size} group types: #{discovered.map do |t|
|
|
70
|
+
t[:abbreviation]
|
|
71
|
+
end.join(', ')}", 1
|
|
72
|
+
discovered
|
|
73
|
+
end
|
|
74
|
+
end
|
|
64
75
|
|
|
65
|
-
|
|
66
|
-
|
|
76
|
+
def discover_group_types(doc)
|
|
77
|
+
doc.css("table.tablesorter tbody tr").filter_map do |row|
|
|
78
|
+
type_cell = row.at_css("td a")
|
|
79
|
+
next unless type_cell && type_cell["href"]
|
|
67
80
|
|
|
68
|
-
|
|
69
|
-
|
|
81
|
+
href = type_cell["href"]
|
|
82
|
+
next unless href.include?("/")
|
|
70
83
|
|
|
71
|
-
|
|
72
|
-
name:
|
|
73
|
-
abbreviation:
|
|
74
|
-
url: href
|
|
84
|
+
{
|
|
85
|
+
name: type_cell.text.strip,
|
|
86
|
+
abbreviation: href.sub(%r{/$}, "").split("/").last.downcase,
|
|
87
|
+
url: href,
|
|
75
88
|
}
|
|
76
89
|
end
|
|
77
|
-
|
|
78
|
-
# If we didn't find any types in the table, use the predefined list
|
|
79
|
-
if group_types.empty?
|
|
80
|
-
log "Using predefined group types...", 1
|
|
81
|
-
standard_types = [
|
|
82
|
-
{ name: "Working Group", abbreviation: "wg", url: "/wg/" },
|
|
83
|
-
{ name: "Research Group", abbreviation: "rg", url: "/rg/" },
|
|
84
|
-
{ name: "Area", abbreviation: "area", url: "/area/" },
|
|
85
|
-
{ name: "Team", abbreviation: "team", url: "/team/" },
|
|
86
|
-
{ name: "Program", abbreviation: "program", url: "/program/" },
|
|
87
|
-
{ name: "Directorate", abbreviation: "dir", url: "/dir/" },
|
|
88
|
-
{ name: "Advisory Group", abbreviation: "ag", url: "/ag/" },
|
|
89
|
-
{ name: "BOF", abbreviation: "bof", url: "/bof/" }
|
|
90
|
-
]
|
|
91
|
-
group_types = standard_types
|
|
92
|
-
end
|
|
93
|
-
|
|
94
|
-
log "Found #{group_types.size} group types: #{group_types.map { |t| t[:abbreviation] }.join(', ')}", 1
|
|
95
|
-
group_types
|
|
96
90
|
end
|
|
97
91
|
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
# Try different table selectors
|
|
104
|
-
selectors = [
|
|
105
|
-
'.group-list tbody tr', # Traditional format
|
|
106
|
-
'table.table-sm tbody tr', # New table format
|
|
107
|
-
'table.tablesorter tbody tr' # Another possible format
|
|
108
|
-
]
|
|
109
|
-
|
|
110
|
-
rows = []
|
|
111
|
-
selectors.each do |selector|
|
|
112
|
-
found_rows = doc.css(selector)
|
|
113
|
-
if found_rows.any?
|
|
114
|
-
log "Found #{found_rows.size} groups using selector: #{selector}", 2
|
|
115
|
-
rows = found_rows
|
|
116
|
-
break
|
|
117
|
-
end
|
|
118
|
-
end
|
|
119
|
-
|
|
120
|
-
rows.each do |row|
|
|
121
|
-
# Try different selectors for finding the abbreviation and name
|
|
122
|
-
abbreviation = nil
|
|
123
|
-
name = nil
|
|
124
|
-
|
|
125
|
-
# First, try to find the abbreviation and name using standard classes
|
|
126
|
-
abbreviation ||= row.at_css('.acronym')&.text&.strip
|
|
127
|
-
name ||= row.at_css('.name')&.text&.strip
|
|
128
|
-
|
|
129
|
-
# If that doesn't work, try to find by column position
|
|
130
|
-
if abbreviation.nil? || name.nil?
|
|
131
|
-
# First column might be the abbreviation, second might be the name
|
|
132
|
-
cells = row.css('td')
|
|
133
|
-
if cells.size >= 2
|
|
134
|
-
abbreviation ||= cells[0].text.strip
|
|
135
|
-
name ||= cells[1].text.strip
|
|
136
|
-
end
|
|
137
|
-
end
|
|
138
|
-
|
|
139
|
-
# If we still don't have them, try to extract from links
|
|
140
|
-
if abbreviation.nil? || name.nil?
|
|
141
|
-
link = row.at_css('a')
|
|
142
|
-
if link
|
|
143
|
-
# Try to extract abbreviation from the URL
|
|
144
|
-
if link['href'] =~ %r{/([^/]+)/?$}
|
|
145
|
-
abbreviation ||= $1.upcase
|
|
146
|
-
end
|
|
147
|
-
|
|
148
|
-
# Use link text as the name
|
|
149
|
-
name ||= link.text.strip
|
|
150
|
-
end
|
|
151
|
-
end
|
|
92
|
+
def extract_groups_from_table(doc, type)
|
|
93
|
+
rows = TABLE_SELECTORS.filter_map do |selector|
|
|
94
|
+
found = doc.css(selector)
|
|
95
|
+
found.any? ? found : nil
|
|
96
|
+
end.first || []
|
|
152
97
|
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
# Extract other fields from the row
|
|
157
|
-
status = 'active' # Default to active
|
|
158
|
-
|
|
159
|
-
# Try to find status from row classes or content
|
|
160
|
-
status = 'concluded' if row['class'] && row['class'].include?('concluded')
|
|
161
|
-
status = 'concluded' if row.text.include?('Concluded')
|
|
162
|
-
status = 'active' if row.at_css('.active') || row.text.include?('Active')
|
|
163
|
-
|
|
164
|
-
# Try to find the area
|
|
165
|
-
area = nil
|
|
166
|
-
area_element = row.at_css('.area')
|
|
167
|
-
area = area_element.text.strip if area_element
|
|
168
|
-
|
|
169
|
-
# Get the group detail page URL
|
|
170
|
-
detail_link = row.at_css('a')
|
|
171
|
-
next unless detail_link
|
|
172
|
-
|
|
173
|
-
group_url = detail_link['href']
|
|
174
|
-
detail_url = URI.join(BASE_URL, group_url)
|
|
175
|
-
|
|
176
|
-
# Fetch additional details from the group's page
|
|
177
|
-
begin
|
|
178
|
-
details = fetch_group_details(detail_url)
|
|
179
|
-
|
|
180
|
-
# Create Group object
|
|
181
|
-
group = Importer::Group.new(
|
|
182
|
-
abbreviation: abbreviation,
|
|
183
|
-
name: name,
|
|
184
|
-
organization: 'ietf',
|
|
185
|
-
type: type[:abbreviation],
|
|
186
|
-
area: area,
|
|
187
|
-
status: status,
|
|
188
|
-
description: details[:description],
|
|
189
|
-
chairs: details[:chairs],
|
|
190
|
-
mailing_list: details[:mailing_list],
|
|
191
|
-
mailing_list_archive: details[:mailing_list_archive],
|
|
192
|
-
website_url: details[:website_url],
|
|
193
|
-
charter_url: details[:charter_url],
|
|
194
|
-
concluded_date: details[:concluded_date]
|
|
195
|
-
)
|
|
196
|
-
|
|
197
|
-
groups << group
|
|
198
|
-
rescue => e
|
|
199
|
-
log "Error fetching details for #{abbreviation}: #{e.message}", 2
|
|
200
|
-
end
|
|
98
|
+
rows.filter_map do |row|
|
|
99
|
+
extract_group_from_row(row, type)
|
|
201
100
|
end
|
|
202
101
|
end
|
|
203
102
|
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
103
|
+
def extract_group_from_row(row, type)
|
|
104
|
+
basic = extract_basic_info(row)
|
|
105
|
+
return nil unless basic[:abbreviation] && basic[:name]
|
|
106
|
+
|
|
107
|
+
status = determine_status(row)
|
|
108
|
+
area = row.at_css(".area")&.text&.strip
|
|
109
|
+
|
|
110
|
+
detail_link = row.at_css("a")
|
|
111
|
+
return nil unless detail_link
|
|
112
|
+
|
|
113
|
+
detail_url = URI.join(BASE_URL, detail_link["href"])
|
|
114
|
+
details = fetch_group_details(detail_url)
|
|
115
|
+
|
|
116
|
+
build_group(
|
|
117
|
+
abbreviation: basic[:abbreviation],
|
|
118
|
+
name: basic[:name],
|
|
119
|
+
organization: "ietf",
|
|
120
|
+
type: type[:abbreviation],
|
|
121
|
+
area: area,
|
|
122
|
+
status: status,
|
|
123
|
+
**details,
|
|
124
|
+
)
|
|
125
|
+
rescue StandardError => e
|
|
126
|
+
log "Error fetching details for #{basic&.dig(:abbreviation)}: #{e.message}",
|
|
127
|
+
2
|
|
128
|
+
nil
|
|
129
|
+
end
|
|
217
130
|
|
|
218
|
-
|
|
219
|
-
|
|
131
|
+
def extract_basic_info(row)
|
|
132
|
+
abbreviation = row.at_css(".acronym")&.text&.strip
|
|
133
|
+
name = row.at_css(".name")&.text&.strip
|
|
220
134
|
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
135
|
+
if abbreviation.nil? || name.nil?
|
|
136
|
+
cells = row.css("td")
|
|
137
|
+
if cells.size >= 2
|
|
138
|
+
abbreviation ||= cells[0].text.strip
|
|
139
|
+
name ||= cells[1].text.strip
|
|
140
|
+
end
|
|
225
141
|
end
|
|
226
142
|
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
143
|
+
if abbreviation.nil? || name.nil?
|
|
144
|
+
link = row.at_css("a")
|
|
145
|
+
if link
|
|
146
|
+
abbreviation ||= $1.upcase if link["href"] =~ %r{/([^/]+)/?$}
|
|
147
|
+
name ||= link.text.strip
|
|
148
|
+
end
|
|
230
149
|
end
|
|
231
150
|
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
if mailing_list
|
|
235
|
-
details[:mailing_list] = mailing_list['href'].sub('mailto:', '')
|
|
236
|
-
end
|
|
151
|
+
{ abbreviation: abbreviation, name: name }
|
|
152
|
+
end
|
|
237
153
|
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
if
|
|
241
|
-
|
|
242
|
-
end
|
|
154
|
+
def determine_status(row)
|
|
155
|
+
return "concluded" if row["class"]&.include?("concluded")
|
|
156
|
+
return "concluded" if row.text.include?("Concluded")
|
|
157
|
+
return "active" if row.at_css(".active") || row.text.include?("Active")
|
|
243
158
|
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
if website
|
|
247
|
-
details[:website_url] = website['href']
|
|
248
|
-
end
|
|
159
|
+
"active"
|
|
160
|
+
end
|
|
249
161
|
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
162
|
+
def fetch_group_details(url)
|
|
163
|
+
doc = fetch_html(url)
|
|
164
|
+
return {} unless doc
|
|
165
|
+
|
|
166
|
+
{
|
|
167
|
+
description: doc.at_css("#charter")&.text&.strip,
|
|
168
|
+
chairs: doc.css(".role-WG-chair, .role-RG-chair").map do |c|
|
|
169
|
+
c.text.strip
|
|
170
|
+
end,
|
|
171
|
+
mailing_list: doc.at_css('a[href^="mailto:"]')&.[]("href")&.sub(
|
|
172
|
+
"mailto:", ""
|
|
173
|
+
),
|
|
174
|
+
mailing_list_archive: doc.at_css('a[href*="mailarchive.ietf.org"]')&.[]("href"),
|
|
175
|
+
website_url: doc.at_css(".additional-urls a")&.[]("href"),
|
|
176
|
+
charter_url: extract_charter_url(doc),
|
|
177
|
+
concluded_date: extract_concluded_date(doc),
|
|
178
|
+
}
|
|
179
|
+
end
|
|
255
180
|
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
if
|
|
259
|
-
|
|
260
|
-
details[:concluded_date] = Date.parse(concluded_info[1])
|
|
261
|
-
rescue
|
|
262
|
-
# Just leave it as nil if we can't parse it
|
|
263
|
-
end
|
|
264
|
-
end
|
|
181
|
+
def extract_charter_url(doc)
|
|
182
|
+
link = doc.at_css('a[href*="/charter/"]')
|
|
183
|
+
URI.join("https://datatracker.ietf.org", link["href"]).to_s if link
|
|
184
|
+
end
|
|
265
185
|
|
|
266
|
-
|
|
186
|
+
def extract_concluded_date(doc)
|
|
187
|
+
match = doc.text.match(/Concluded\s+([A-Z][a-z]+\s+\d{4})/)
|
|
188
|
+
Date.parse(match[1]) if match
|
|
189
|
+
rescue Date::Error
|
|
190
|
+
nil
|
|
267
191
|
end
|
|
268
192
|
end
|
|
269
193
|
end
|