RubyGems - ietf-data-importer - Versions diffs - 0.3.0 - Mend

ietf-data-importer 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

checksums.yaml +7 -0
data/.github/workflows/check_update.yml +44 -0
data/.github/workflows/rake.yml +15 -0
data/.github/workflows/release.yml +25 -0
data/.gitignore +2 -0
data/.rubocop.yml +10 -0
data/Gemfile +11 -0
data/README.adoc +220 -0
data/Rakefile +8 -0
data/bin/console +15 -0
data/bin/setup +8 -0
data/exe/ietf-data-importer +6 -0
data/ietf-data-importer.gemspec +37 -0
data/lib/ietf/data/importer/cli.rb +47 -0
data/lib/ietf/data/importer/group.rb +42 -0
data/lib/ietf/data/importer/group_collection.rb +19 -0
data/lib/ietf/data/importer/groups.yaml +1745 -0
data/lib/ietf/data/importer/scrapers/base_scraper.rb +33 -0
data/lib/ietf/data/importer/scrapers/ietf_scraper.rb +272 -0
data/lib/ietf/data/importer/scrapers/irtf_scraper.rb +350 -0
data/lib/ietf/data/importer/scrapers.rb +64 -0
data/lib/ietf/data/importer/version.rb +9 -0
data/lib/ietf/data/importer.rb +93 -0
metadata +126 -0

data/lib/ietf/data/importer/scrapers/base_scraper.rb ADDED Viewed

@@ -0,0 +1,33 @@
+# frozen_string_literal: true
+require "nokogiri"
+require "open-uri"
+module Ietf
+  module Data
+    module Importer
+      module Scrapers
+        # Base class for web scrapers
+        class BaseScraper
+          # Fetch HTML content from a URL and parse it with Nokogiri
+          # @param url [String] The URL to fetch
+          # @return [Nokogiri::HTML::Document] The parsed HTML document
+          def fetch_html(url)
+            Nokogiri::HTML(URI.open(url))
+          rescue => e
+            puts "  Error fetching URL #{url}: #{e.message}"
+            nil
+          end
+          # Log a message with indentation
+          # @param message [String] The message to log
+          # @param level [Integer] The indentation level (default: 0)
+          def log(message, level = 0)
+            indent = "  " * level
+            puts "#{indent}#{message}"
+          end
+        end
+      end
+    end
+  end
+end

data/lib/ietf/data/importer/scrapers/ietf_scraper.rb ADDED Viewed

@@ -0,0 +1,272 @@
+# frozen_string_literal: true
+require_relative "base_scraper"
+require_relative "../group_collection"
+module Ietf
+  module Data
+    module Importer
+      module Scrapers
+        # Scraper for IETF groups from datatracker.ietf.org
+        class IetfScraper < BaseScraper
+          # Base URL for IETF datatracker
+          BASE_URL = "https://datatracker.ietf.org/group/"
+          # Fetch all IETF groups
+          # @return [Array<Ietf::Data::Importer::Group>] Array of Group objects
+          def fetch
+            groups = []
+            log "Fetching IETF groups..."
+            # Fetch all group types
+            group_types = fetch_group_types
+            # For each group type, fetch its groups
+            group_types.each do |type|
+              log "Fetching #{type[:name]} groups...", 1
+              # Skip if URL is empty
+              next if type[:url].nil? || type[:url].empty?
+              # Construct the full URL
+              type_url = if type[:url].start_with?('/')
+                "https://datatracker.ietf.org#{type[:url]}"
+              else
+                "https://datatracker.ietf.org/#{type[:url]}"
+              end
+              type_doc = fetch_html(type_url)
+              next unless type_doc
+              # Extract groups from the table
+              extract_groups_from_table(type_doc, type, groups)
+            end
+            groups
+          end
+          private
+          # Fetch all group types from the main IETF groups page
+          # @return [Array<Hash>] Array of group type information
+          def fetch_group_types
+            doc = fetch_html(BASE_URL)
+            return [] unless doc
+            log "Looking for group types on the page...", 1
+            # Extract group types from the table on the main page
+            group_types = []
+            # Try to find from the table first
+            doc.css('table.tablesorter tbody tr').each do |row|
+              type_cell = row.at_css('td a')
+              next unless type_cell && type_cell['href']
+              href = type_cell['href']
+              next unless href.include?('/')
+              type_abbr = href.sub(/\/$/, '').split('/').last
+              name = type_cell.text.strip
+              group_types << {
+                name: name,
+                abbreviation: type_abbr.downcase,
+                url: href
+              }
+            end
+            # If we didn't find any types in the table, use the predefined list
+            if group_types.empty?
+              log "Using predefined group types...", 1
+              standard_types = [
+                { name: "Working Group", abbreviation: "wg", url: "/wg/" },
+                { name: "Research Group", abbreviation: "rg", url: "/rg/" },
+                { name: "Area", abbreviation: "area", url: "/area/" },
+                { name: "Team", abbreviation: "team", url: "/team/" },
+                { name: "Program", abbreviation: "program", url: "/program/" },
+                { name: "Directorate", abbreviation: "dir", url: "/dir/" },
+                { name: "Advisory Group", abbreviation: "ag", url: "/ag/" },
+                { name: "BOF", abbreviation: "bof", url: "/bof/" }
+              ]
+              group_types = standard_types
+            end
+            log "Found #{group_types.size} group types: #{group_types.map { |t| t[:abbreviation] }.join(', ')}", 1
+            group_types
+          end
+          # Extract groups from a table on the group type page
+          # @param doc [Nokogiri::HTML::Document] The HTML document
+          # @param type [Hash] The group type information
+          # @param groups [Array<Ietf::Data::Importer::Group>] Array to add groups to
+          def extract_groups_from_table(doc, type, groups)
+            # Try different table selectors
+            selectors = [
+              '.group-list tbody tr',            # Traditional format
+              'table.table-sm tbody tr',         # New table format
+              'table.tablesorter tbody tr'       # Another possible format
+            ]
+            rows = []
+            selectors.each do |selector|
+              found_rows = doc.css(selector)
+              if found_rows.any?
+                log "Found #{found_rows.size} groups using selector: #{selector}", 2
+                rows = found_rows
+                break
+              end
+            end
+            rows.each do |row|
+              # Try different selectors for finding the abbreviation and name
+              abbreviation = nil
+              name = nil
+              # First, try to find the abbreviation and name using standard classes
+              abbreviation ||= row.at_css('.acronym')&.text&.strip
+              name ||= row.at_css('.name')&.text&.strip
+              # If that doesn't work, try to find by column position
+              if abbreviation.nil? || name.nil?
+                # First column might be the abbreviation, second might be the name
+                cells = row.css('td')
+                if cells.size >= 2
+                  abbreviation ||= cells[0].text.strip
+                  name ||= cells[1].text.strip
+                end
+              end
+              # If we still don't have them, try to extract from links
+              if abbreviation.nil? || name.nil?
+                link = row.at_css('a')
+                if link
+                  # Try to extract abbreviation from the URL
+                  if link['href'] =~ %r{/([^/]+)/?$}
+                    abbreviation ||= $1.upcase
+                  end
+                  # Use link text as the name
+                  name ||= link.text.strip
+                end
+              end
+              # Skip if we still couldn't extract basic info
+              next unless abbreviation && name && !abbreviation.empty? && !name.empty?
+              # Extract other fields from the row
+              status = 'active'  # Default to active
+              # Try to find status from row classes or content
+              status = 'concluded' if row['class'] && row['class'].include?('concluded')
+              status = 'concluded' if row.text.include?('Concluded')
+              status = 'active' if row.at_css('.active') || row.text.include?('Active')
+              # Try to find the area
+              area = nil
+              area_element = row.at_css('.area')
+              area = area_element.text.strip if area_element
+              # Get the group detail page URL
+              detail_link = row.at_css('a')
+              next unless detail_link
+              group_url = detail_link['href']
+              detail_url = URI.join(BASE_URL, group_url)
+              # Fetch additional details from the group's page
+              begin
+                details = fetch_group_details(detail_url)
+                # Create Group object
+                group = Importer::Group.new(
+                  abbreviation: abbreviation,
+                  name: name,
+                  organization: 'ietf',
+                  type: type[:abbreviation],
+                  area: area,
+                  status: status,
+                  description: details[:description],
+                  chairs: details[:chairs],
+                  mailing_list: details[:mailing_list],
+                  mailing_list_archive: details[:mailing_list_archive],
+                  website_url: details[:website_url],
+                  charter_url: details[:charter_url],
+                  concluded_date: details[:concluded_date]
+                )
+                groups << group
+              rescue => e
+                log "Error fetching details for #{abbreviation}: #{e.message}", 2
+              end
+            end
+          end
+          # Fetch details for a specific group from its page
+          # @param url [String] The URL of the group's page
+          # @return [Hash] Hash of group details
+          def fetch_group_details(url)
+            details = {
+              description: nil,
+              chairs: [],
+              mailing_list: nil,
+              mailing_list_archive: nil,
+              website_url: nil,
+              charter_url: nil,
+              concluded_date: nil
+            }
+            doc = fetch_html(url)
+            return details unless doc
+            # Extract description from charter
+            charter_section = doc.at_css('#charter')
+            if charter_section
+              details[:description] = charter_section.text.strip
+            end
+            # Extract chairs
+            doc.css('.role-WG-chair, .role-RG-chair').each do |chair|
+              details[:chairs] << chair.text.strip
+            end
+            # Extract mailing list
+            mailing_list = doc.at_css('a[href^="mailto:"]')
+            if mailing_list
+              details[:mailing_list] = mailing_list['href'].sub('mailto:', '')
+            end
+            # Extract mailing list archive
+            archive = doc.at_css('a[href*="mailarchive.ietf.org"]')
+            if archive
+              details[:mailing_list_archive] = archive['href']
+            end
+            # Extract website if available
+            website = doc.at_css('.additional-urls a')
+            if website
+              details[:website_url] = website['href']
+            end
+            # Extract charter URL
+            charter_link = doc.at_css('a[href*="/charter/"]')
+            if charter_link
+              details[:charter_url] = URI.join("https://datatracker.ietf.org", charter_link['href']).to_s
+            end
+            # Extract concluded date
+            concluded_info = doc.text.match(/Concluded\s+([A-Z][a-z]+\s+\d{4})/)
+            if concluded_info
+              begin
+                details[:concluded_date] = Date.parse(concluded_info[1])
+              rescue
+                # Just leave it as nil if we can't parse it
+              end
+            end
+            details
+          end
+        end
+      end
+    end
+  end
+end

data/lib/ietf/data/importer/scrapers/irtf_scraper.rb ADDED Viewed

@@ -0,0 +1,350 @@
+# frozen_string_literal: true
+require_relative "base_scraper"
+require_relative "../group_collection"
+module Ietf
+  module Data
+    module Importer
+      module Scrapers
+        # Scraper for IRTF groups from irtf.org
+        class IrtfScraper < BaseScraper
+          # Base URL for IRTF website
+          BASE_URL = "https://www.irtf.org/groups.html"
+          # Fetch all IRTF groups.
+          #
+          # Strategies are tried in order and the first one that yields groups
+          # wins: the "Research Groups" dropdown menu, the active/concluded
+          # sections, alternative section titles, and finally any list of links.
+          # An unexpected error is logged and whatever was collected so far is
+          # returned.
+          #
+          # @return [Array<Ietf::Data::Importer::Group>] Array of Group objects
+          def fetch
+            groups = []
+            log "Fetching IRTF groups..."
+            doc = fetch_html(BASE_URL)
+            return [] unless doc
+            # First try to extract from the dropdown menu
+            dropdown_groups = extract_from_dropdown(doc)
+            if dropdown_groups.any?
+              log "Found #{dropdown_groups.size} groups in dropdown menu", 1
+              return groups.concat(dropdown_groups)
+            end
+            # Fall back to section-based extraction; list the headings to help
+            # diagnose layout changes
+            headings = doc.css('h3').map(&:text).join(', ')
+            log "Found headings on IRTF page: #{headings}", 1
+            active_groups = extract_groups(doc, 'Active Research Groups', 'active')
+            log "Found #{active_groups.size} active IRTF groups", 1
+            concluded_groups = extract_groups(doc, 'Concluded Research Groups', 'concluded')
+            log "Found #{concluded_groups.size} concluded IRTF groups", 1
+            groups.concat(active_groups)
+            groups.concat(concluded_groups)
+            return groups unless groups.empty?
+            log "No groups found with standard selectors, trying alternatives...", 1
+            ['Current Research Groups', 'Research Groups', 'IRTF Groups'].each do |title|
+              section_groups = extract_groups(doc, title, 'active')
+              next if section_groups.empty?
+              log "Found #{section_groups.size} groups with section title: #{title}", 1
+              groups.concat(section_groups)
+              # Stop at the first hit: headings are matched by substring, so
+              # 'Research Groups' would match a 'Current Research Groups' heading
+              # again and add (and re-fetch) every group a second time
+              break
+            end
+            return groups unless groups.empty?
+            # Last resort: any unordered list that contains links
+            log "Using generic list item selector...", 1
+            doc.css('ul').each do |list|
+              next unless list.css('li a').any?
+              generic_groups = extract_groups_from_list(list, 'active')
+              next if generic_groups.empty?
+              log "Found #{generic_groups.size} groups using generic list selector", 1
+              groups.concat(generic_groups)
+            end
+            groups
+          rescue StandardError => e
+            log "Error fetching IRTF groups: #{e.message}", 1
+            groups
+          end
+          # Extract groups from the dropdown menu
+          # @param doc [Nokogiri::HTML::Document] The HTML document
+          # @return [Array<Ietf::Data::Importer::Group>] Array of Group objects
+          def extract_from_dropdown(doc)
+            groups = []
+            # Look for the dropdown menu containing research groups
+            dropdown = doc.css('a.dropdown-toggle').find do |el|
+              el.text.include?('Research Groups')
+            end
+            return [] unless dropdown
+            # Find the dropdown menu
+            dropdown_parent = dropdown.parent
+            dropdown_menu = dropdown_parent.css('.dropdown-menu')
+            return [] unless dropdown_menu.any?
+            log "Found dropdown menu with research groups", 1
+            # Extract groups from the dropdown menu
+            dropdown_menu.css('a.dropdown-item').each do |link|
+              next unless link && link['href']
+              name = link.text.strip
+              href = link['href']
+              # Extract abbreviation from href (e.g., cfrg.html -> CFRG)
+              if href =~ /(\w+)\.html$/
+                abbreviation = $1.upcase
+              else
+                next # Skip if we can't determine abbreviation
+              end
+              # Construct full URL if it's a relative path
+              details_url = href
+              if !details_url.start_with?('http')
+                if details_url.start_with?('/')
+                  details_url = "https://www.irtf.org#{details_url}"
+                else
+                  details_url = "https://www.irtf.org/#{details_url}"
+                end
+              end
+              begin
+                details = fetch_group_details(details_url)
+                group = Importer::Group.new(
+                  abbreviation: abbreviation,
+                  name: name,
+                  organization: 'irtf',
+                  type: 'rg',
+                  area: nil,
+                  status: 'active', # Assume active since it's in the menu
+                  description: nil, # Will be populated from details page if available
+                  chairs: details[:chairs],
+                  mailing_list: details[:mailing_list],
+                  mailing_list_archive: details[:mailing_list_archive],
+                  website_url: details_url,
+                  charter_url: details[:charter_url],
+                  concluded_date: details[:concluded_date]
+                )
+                groups << group
+              rescue => e
+                log "Error fetching details for #{abbreviation} (#{details_url}): #{e.message}", 2
+              end
+            end
+            groups
+          end
+          private
+          # Extract groups from a section on the IRTF page
+          # @param doc [Nokogiri::HTML::Document] The HTML document
+          # @param section_title [String] The title of the section to extract from
+          # @param status [String] The status of the groups in this section (active/concluded)
+          # @return [Array<Ietf::Data::Importer::Group>] Array of Group objects
+          def extract_groups(doc, section_title, status)
+            groups = []
+            section = doc.xpath("//h3[contains(text(), '#{section_title}')]/following-sibling::ul[1]")
+            section.css('li').each do |group_item|
+              link = group_item.at_css('a')
+              next unless link
+              name = link.text.strip
+              abbreviation = nil
+              # Extract abbreviation from the text (typically in parentheses)
+              if name =~ /\(([^)]+)\)/
+                abbreviation = $1
+              end
+              # If unable to extract abbreviation, try from the URL
+              if abbreviation.nil? && link['href'] =~ %r{/(\w+)/?$}
+                abbreviation = $1.upcase
+              end
+              next unless abbreviation
+              # Extract description (text after the link)
+              description = group_item.text.sub(link.text, '').strip
+              # Remove parenthesized abbreviation from description
+              description = description.sub(/\s*\([^)]+\)\s*/, ' ').strip
+              # Get details from the group's page
+              details_url = link['href']
+              begin
+                details = fetch_group_details(details_url)
+                group = Importer::Group.new(
+                  abbreviation: abbreviation,
+                  name: name.sub(/\s*\([^)]+\)\s*/, '').strip,
+                  organization: 'irtf',
+                  type: 'rg',
+                  area: nil,
+                  status: status,
+                  description: description,
+                  chairs: details[:chairs],
+                  mailing_list: details[:mailing_list],
+                  mailing_list_archive: details[:mailing_list_archive],
+                  website_url: details_url,
+                  charter_url: details[:charter_url],
+                  concluded_date: details[:concluded_date]
+                )
+                groups << group
+              rescue => e
+                log "Error fetching details for #{abbreviation}: #{e.message}", 2
+              end
+            end
+            groups
+          end
+          # Turn the items of a list into groups, independent of any section heading.
+          #
+          # An item is used when it has a link with an href and an abbreviation
+          # can be derived, either from parentheses in the link text or from the
+          # last path segment of the href.
+          #
+          # @param list_element [Nokogiri::XML::Element] The list element to extract from
+          # @param status [String] The status of the groups in this list (active/concluded)
+          # @return [Array<Ietf::Data::Importer::Group>] Array of Group objects
+          def extract_groups_from_list(list_element, status)
+            list_element.css('li').each_with_object([]) do |item, collected|
+              anchor = item.at_css('a')
+              href = anchor && anchor['href']
+              next unless href
+              label = anchor.text.strip
+              # Abbreviation is typically given in parentheses in the link text
+              abbreviation = label[/\(([^)]+)\)/, 1]
+              # Otherwise take it from the URL
+              if abbreviation.nil?
+                from_url = href.match(%r{/(\w+)/?$})
+                abbreviation = from_url[1].upcase if from_url
+              end
+              next if abbreviation.nil? || abbreviation.empty?
+              # Description: item text without the link text and without a
+              # parenthesized abbreviation
+              description = item.text.sub(anchor.text, '').strip.sub(/\s*\([^)]+\)\s*/, ' ').strip
+              # Ensure we have a full URL
+              details_url =
+                if href.start_with?('http')
+                  href
+                elsif href.start_with?('/')
+                  "https://www.irtf.org#{href}"
+                else
+                  "https://www.irtf.org/#{href}"
+                end
+              begin
+                details = fetch_group_details(details_url)
+                collected << Importer::Group.new(
+                  abbreviation: abbreviation,
+                  name: label.sub(/\s*\([^)]+\)\s*/, '').strip,
+                  organization: 'irtf',
+                  type: 'rg',
+                  area: nil,
+                  status: status,
+                  description: description,
+                  chairs: details[:chairs],
+                  mailing_list: details[:mailing_list],
+                  mailing_list_archive: details[:mailing_list_archive],
+                  website_url: details_url,
+                  charter_url: details[:charter_url],
+                  concluded_date: details[:concluded_date]
+                )
+              rescue StandardError => e
+                log "Error fetching details for #{abbreviation} (#{details_url}): #{e.message}", 2
+              end
+            end
+          end
+          # Fetch details for a specific IRTF group from its page
+          # @param url [String] The URL of the group's page
+          # @return [Hash] Hash of group details
+          def fetch_group_details(url)
+            details = {
+              chairs: [],
+              mailing_list: nil,
+              mailing_list_archive: nil,
+              charter_url: nil,
+              concluded_date: nil
+            }
+            doc = fetch_html(url)
+            return details unless doc
+            # Extract chairs
+            chair_section = doc.xpath("//h3[contains(text(), 'Chair')]/following-sibling::p[1]")
+            if chair_section
+              details[:chairs] << chair_section.text.strip
+            end
+            # Extract mailing list
+            mailing_list = doc.at_css('a[href^="mailto:"]')
+            if mailing_list
+              details[:mailing_list] = mailing_list['href'].sub('mailto:', '')
+            end
+            # Extract mailing list archive
+            archive = doc.at_css('a[href*="mailarchive.ietf.org"]')
+            if archive
+              details[:mailing_list_archive] = archive['href']
+            end
+            # Extract charter URL
+            charter_link = doc.at_css('a[href*="charter"]')
+            if charter_link
+              details[:charter_url] = URI.join(url, charter_link['href']).to_s
+            end
+            # Extract concluded date from the page or the URL
+            if url.include?('/concluded/')
+              concluded_info = doc.text.match(/concluded in\s+([A-Z][a-z]+\s+\d{4})/)
+              if concluded_info
+                begin
+                  details[:concluded_date] = Date.parse(concluded_info[1])
+                rescue
+                  # Just leave it as nil if we can't parse it
+                end
+              end
+            end
+            details
+          end
+        end
+      end
+    end
+  end
+end