RubyGems - ietf-data-importer - Versions diffs - 0.3.0 → 0.3.1 - Mend

ietf-data-importer 0.3.0 → 0.3.1

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Files changed (20)

checksums.yaml +4 -4
data/.github/workflows/check_update.yml +7 -7
data/.gitignore +4 -0
data/.rubocop.yml +8 -1
data/.rubocop_todo.yml +49 -0
data/CLAUDE.md +73 -0
data/Gemfile +1 -2
data/README.adoc +32 -24
data/exe/ietf-data-importer +1 -1
data/ietf-data-importer.gemspec +3 -2
data/lib/ietf/data/importer/cli.rb +14 -23
data/lib/ietf/data/importer/group.rb +39 -4
data/lib/ietf/data/importer/group_collection.rb +101 -1
data/lib/ietf/data/importer/scrapers/base_scraper.rb +18 -9
data/lib/ietf/data/importer/scrapers/ietf_scraper.rb +137 -213
data/lib/ietf/data/importer/scrapers/irtf_scraper.rb +142 -291
data/lib/ietf/data/importer/scrapers.rb +7 -35
data/lib/ietf/data/importer/version.rb +1 -1
data/lib/ietf/data/importer.rb +56 -66
metadata +14 -11

data/lib/ietf/data/importer/scrapers/ietf_scraper.rb CHANGED Viewed

@@ -1,269 +1,193 @@
 # frozen_string_literal: true
 require_relative "base_scraper"
-require_relative "../group_collection"
 module Ietf
   module Data
     module Importer
       module Scrapers
-        # Scraper for IETF groups from datatracker.ietf.org
         class IetfScraper < BaseScraper
-          # Base URL for IETF datatracker
           BASE_URL = "https://datatracker.ietf.org/group/"
-          # Fetch all IETF groups
-          # @return [Array<Ietf::Data::Importer::Group>] Array of Group objects
+          STANDARD_TYPES = [
+            { name: "Working Group", abbreviation: "wg", url: "/wg/" },
+            { name: "Research Group", abbreviation: "rg", url: "/rg/" },
+            { name: "Area", abbreviation: "area", url: "/area/" },
+            { name: "Team", abbreviation: "team", url: "/team/" },
+            { name: "Program", abbreviation: "program", url: "/program/" },
+            { name: "Directorate", abbreviation: "dir", url: "/dir/" },
+            { name: "Advisory Group", abbreviation: "ag", url: "/ag/" },
+            { name: "BOF", abbreviation: "bof", url: "/bof/" },
+          ].freeze
+          TABLE_SELECTORS = [
+            ".group-list tbody tr",
+            "table.table-sm tbody tr",
+            "table.tablesorter tbody tr",
+          ].freeze
           def fetch
-            groups = []
             log "Fetching IETF groups..."
-            # Fetch all group types
             group_types = fetch_group_types
-            # For each group type, fetch its groups
-            group_types.each do |type|
+            groups = group_types.flat_map do |type|
               log "Fetching #{type[:name]} groups...", 1
+              next [] if type[:url].nil? || type[:url].empty?
-              # Skip if URL is empty
-              next if type[:url].nil? || type[:url].empty?
-              # Construct the full URL
-              type_url = if type[:url].start_with?('/')
-                "https://datatracker.ietf.org#{type[:url]}"
-              else
-                "https://datatracker.ietf.org/#{type[:url]}"
-              end
+              type_url = resolve_url(type[:url])
               type_doc = fetch_html(type_url)
-              next unless type_doc
+              next [] unless type_doc
-              # Extract groups from the table
-              extract_groups_from_table(type_doc, type, groups)
+              extract_groups_from_table(type_doc, type)
             end
-            groups
+            build_collection(groups)
           end
           private
-          # Fetch all group types from the main IETF groups page
-          # @return [Array<Hash>] Array of group type information
+          def resolve_url(path)
+            if path.start_with?("/")
+              "https://datatracker.ietf.org#{path}"
+            else
+              "https://datatracker.ietf.org/#{path}"
+            end
+          end
           def fetch_group_types
             doc = fetch_html(BASE_URL)
-            return [] unless doc
+            return STANDARD_TYPES unless doc
             log "Looking for group types on the page...", 1
-            # Extract group types from the table on the main page
-            group_types = []
-            # Try to find from the table first
-            doc.css('table.tablesorter tbody tr').each do |row|
-              type_cell = row.at_css('td a')
-              next unless type_cell && type_cell['href']
+            discovered = discover_group_types(doc)
+            if discovered.empty?
+              log "Using predefined group types...", 1
+              STANDARD_TYPES
+            else
+              log "Found #{discovered.size} group types: #{discovered.map do |t|
+                t[:abbreviation]
+              end.join(', ')}", 1
+              discovered
+            end
+          end
-              href = type_cell['href']
-              next unless href.include?('/')
+          def discover_group_types(doc)
+            doc.css("table.tablesorter tbody tr").filter_map do |row|
+              type_cell = row.at_css("td a")
+              next unless type_cell && type_cell["href"]
-              type_abbr = href.sub(/\/$/, '').split('/').last
-              name = type_cell.text.strip
+              href = type_cell["href"]
+              next unless href.include?("/")
-              group_types << {
-                name: name,
-                abbreviation: type_abbr.downcase,
-                url: href
+              {
+                name: type_cell.text.strip,
+                abbreviation: href.sub(%r{/$}, "").split("/").last.downcase,
+                url: href,
               }
             end
-            # If we didn't find any types in the table, use the predefined list
-            if group_types.empty?
-              log "Using predefined group types...", 1
-              standard_types = [
-                { name: "Working Group", abbreviation: "wg", url: "/wg/" },
-                { name: "Research Group", abbreviation: "rg", url: "/rg/" },
-                { name: "Area", abbreviation: "area", url: "/area/" },
-                { name: "Team", abbreviation: "team", url: "/team/" },
-                { name: "Program", abbreviation: "program", url: "/program/" },
-                { name: "Directorate", abbreviation: "dir", url: "/dir/" },
-                { name: "Advisory Group", abbreviation: "ag", url: "/ag/" },
-                { name: "BOF", abbreviation: "bof", url: "/bof/" }
-              ]
-              group_types = standard_types
-            end
-            log "Found #{group_types.size} group types: #{group_types.map { |t| t[:abbreviation] }.join(', ')}", 1
-            group_types
           end
-          # Extract groups from a table on the group type page
-          # @param doc [Nokogiri::HTML::Document] The HTML document
-          # @param type [Hash] The group type information
-          # @param groups [Array<Ietf::Data::Importer::Group>] Array to add groups to
-          def extract_groups_from_table(doc, type, groups)
-            # Try different table selectors
-            selectors = [
-              '.group-list tbody tr',            # Traditional format
-              'table.table-sm tbody tr',         # New table format
-              'table.tablesorter tbody tr'       # Another possible format
-            ]
-            rows = []
-            selectors.each do |selector|
-              found_rows = doc.css(selector)
-              if found_rows.any?
-                log "Found #{found_rows.size} groups using selector: #{selector}", 2
-                rows = found_rows
-                break
-              end
-            end
-            rows.each do |row|
-              # Try different selectors for finding the abbreviation and name
-              abbreviation = nil
-              name = nil
-              # First, try to find the abbreviation and name using standard classes
-              abbreviation ||= row.at_css('.acronym')&.text&.strip
-              name ||= row.at_css('.name')&.text&.strip
-              # If that doesn't work, try to find by column position
-              if abbreviation.nil? || name.nil?
-                # First column might be the abbreviation, second might be the name
-                cells = row.css('td')
-                if cells.size >= 2
-                  abbreviation ||= cells[0].text.strip
-                  name ||= cells[1].text.strip
-                end
-              end
-              # If we still don't have them, try to extract from links
-              if abbreviation.nil? || name.nil?
-                link = row.at_css('a')
-                if link
-                  # Try to extract abbreviation from the URL
-                  if link['href'] =~ %r{/([^/]+)/?$}
-                    abbreviation ||= $1.upcase
-                  end
-                  # Use link text as the name
-                  name ||= link.text.strip
-                end
-              end
+          def extract_groups_from_table(doc, type)
+            rows = TABLE_SELECTORS.filter_map do |selector|
+              found = doc.css(selector)
+              found.any? ? found : nil
+            end.first || []
-              # Skip if we still couldn't extract basic info
-              next unless abbreviation && name && !abbreviation.empty? && !name.empty?
-              # Extract other fields from the row
-              status = 'active'  # Default to active
-              # Try to find status from row classes or content
-              status = 'concluded' if row['class'] && row['class'].include?('concluded')
-              status = 'concluded' if row.text.include?('Concluded')
-              status = 'active' if row.at_css('.active') || row.text.include?('Active')
-              # Try to find the area
-              area = nil
-              area_element = row.at_css('.area')
-              area = area_element.text.strip if area_element
-              # Get the group detail page URL
-              detail_link = row.at_css('a')
-              next unless detail_link
-              group_url = detail_link['href']
-              detail_url = URI.join(BASE_URL, group_url)
-              # Fetch additional details from the group's page
-              begin
-                details = fetch_group_details(detail_url)
-                # Create Group object
-                group = Importer::Group.new(
-                  abbreviation: abbreviation,
-                  name: name,
-                  organization: 'ietf',
-                  type: type[:abbreviation],
-                  area: area,
-                  status: status,
-                  description: details[:description],
-                  chairs: details[:chairs],
-                  mailing_list: details[:mailing_list],
-                  mailing_list_archive: details[:mailing_list_archive],
-                  website_url: details[:website_url],
-                  charter_url: details[:charter_url],
-                  concluded_date: details[:concluded_date]
-                )
-                groups << group
-              rescue => e
-                log "Error fetching details for #{abbreviation}: #{e.message}", 2
-              end
+            rows.filter_map do |row|
+              extract_group_from_row(row, type)
             end
           end
-          # Fetch details for a specific group from its page
-          # @param url [String] The URL of the group's page
-          # @return [Hash] Hash of group details
-          def fetch_group_details(url)
-            details = {
-              description: nil,
-              chairs: [],
-              mailing_list: nil,
-              mailing_list_archive: nil,
-              website_url: nil,
-              charter_url: nil,
-              concluded_date: nil
-            }
+          def extract_group_from_row(row, type)
+            basic = extract_basic_info(row)
+            return nil unless basic[:abbreviation] && basic[:name]
+            status = determine_status(row)
+            area = row.at_css(".area")&.text&.strip
+            detail_link = row.at_css("a")
+            return nil unless detail_link
+            detail_url = URI.join(BASE_URL, detail_link["href"])
+            details = fetch_group_details(detail_url)
+            build_group(
+              abbreviation: basic[:abbreviation],
+              name: basic[:name],
+              organization: "ietf",
+              type: type[:abbreviation],
+              area: area,
+              status: status,
+              **details,
+            )
+          rescue StandardError => e
+            log "Error fetching details for #{basic&.dig(:abbreviation)}: #{e.message}",
+                2
+            nil
+          end
-            doc = fetch_html(url)
-            return details unless doc
+          def extract_basic_info(row)
+            abbreviation = row.at_css(".acronym")&.text&.strip
+            name = row.at_css(".name")&.text&.strip
-            # Extract description from charter
-            charter_section = doc.at_css('#charter')
-            if charter_section
-              details[:description] = charter_section.text.strip
+            if abbreviation.nil? || name.nil?
+              cells = row.css("td")
+              if cells.size >= 2
+                abbreviation ||= cells[0].text.strip
+                name ||= cells[1].text.strip
+              end
             end
-            # Extract chairs
-            doc.css('.role-WG-chair, .role-RG-chair').each do |chair|
-              details[:chairs] << chair.text.strip
+            if abbreviation.nil? || name.nil?
+              link = row.at_css("a")
+              if link
+                abbreviation ||= $1.upcase if link["href"] =~ %r{/([^/]+)/?$}
+                name ||= link.text.strip
+              end
             end
-            # Extract mailing list
-            mailing_list = doc.at_css('a[href^="mailto:"]')
-            if mailing_list
-              details[:mailing_list] = mailing_list['href'].sub('mailto:', '')
-            end
+            { abbreviation: abbreviation, name: name }
+          end
-            # Extract mailing list archive
-            archive = doc.at_css('a[href*="mailarchive.ietf.org"]')
-            if archive
-              details[:mailing_list_archive] = archive['href']
-            end
+          def determine_status(row)
+            return "concluded" if row["class"]&.include?("concluded")
+            return "concluded" if row.text.include?("Concluded")
+            return "active" if row.at_css(".active") || row.text.include?("Active")
-            # Extract website if available
-            website = doc.at_css('.additional-urls a')
-            if website
-              details[:website_url] = website['href']
-            end
+            "active"
+          end
-            # Extract charter URL
-            charter_link = doc.at_css('a[href*="/charter/"]')
-            if charter_link
-              details[:charter_url] = URI.join("https://datatracker.ietf.org", charter_link['href']).to_s
-            end
+          def fetch_group_details(url)
+            doc = fetch_html(url)
+            return {} unless doc
+            {
+              description: doc.at_css("#charter")&.text&.strip,
+              chairs: doc.css(".role-WG-chair, .role-RG-chair").map do |c|
+                c.text.strip
+              end,
+              mailing_list: doc.at_css('a[href^="mailto:"]')&.[]("href")&.sub(
+                "mailto:", ""
+              ),
+              mailing_list_archive: doc.at_css('a[href*="mailarchive.ietf.org"]')&.[]("href"),
+              website_url: doc.at_css(".additional-urls a")&.[]("href"),
+              charter_url: extract_charter_url(doc),
+              concluded_date: extract_concluded_date(doc),
+            }
+          end
-            # Extract concluded date
-            concluded_info = doc.text.match(/Concluded\s+([A-Z][a-z]+\s+\d{4})/)
-            if concluded_info
-              begin
-                details[:concluded_date] = Date.parse(concluded_info[1])
-              rescue
-                # Just leave it as nil if we can't parse it
-              end
-            end
+          def extract_charter_url(doc)
+            link = doc.at_css('a[href*="/charter/"]')
+            URI.join("https://datatracker.ietf.org", link["href"]).to_s if link
+          end
-            details
+          def extract_concluded_date(doc)
+            match = doc.text.match(/Concluded\s+([A-Z][a-z]+\s+\d{4})/)
+            Date.parse(match[1]) if match
+          rescue Date::Error
+            nil
           end
         end
       end