RubyGems - ietf-data-importer - Versions diffs - 0.3.0 → 0.3.1 - Mend

ietf-data-importer 0.3.0 → 0.3.1

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

checksums.yaml +4 -4
data/.github/workflows/check_update.yml +7 -7
data/.gitignore +4 -0
data/.rubocop.yml +8 -1
data/.rubocop_todo.yml +49 -0
data/CLAUDE.md +73 -0
data/Gemfile +1 -2
data/README.adoc +32 -24
data/exe/ietf-data-importer +1 -1
data/ietf-data-importer.gemspec +3 -2
data/lib/ietf/data/importer/cli.rb +14 -23
data/lib/ietf/data/importer/group.rb +39 -4
data/lib/ietf/data/importer/group_collection.rb +101 -1
data/lib/ietf/data/importer/scrapers/base_scraper.rb +18 -9
data/lib/ietf/data/importer/scrapers/ietf_scraper.rb +137 -213
data/lib/ietf/data/importer/scrapers/irtf_scraper.rb +142 -291
data/lib/ietf/data/importer/scrapers.rb +7 -35
data/lib/ietf/data/importer/version.rb +1 -1
data/lib/ietf/data/importer.rb +56 -66
metadata +14 -11

data/lib/ietf/data/importer/scrapers/irtf_scraper.rb CHANGED Viewed

@@ -1,347 +1,198 @@
 # frozen_string_literal: true
 require_relative "base_scraper"
-require_relative "../group_collection"
 module Ietf
   module Data
     module Importer
       module Scrapers
-        # Scraper for IRTF groups from irtf.org
         class IrtfScraper < BaseScraper
-          # Base URL for IRTF website
           BASE_URL = "https://www.irtf.org/groups.html"
-          # Fetch all IRTF groups
-          # @return [Array<Ietf::Data::Importer::Group>] Array of Group objects
+          SECTION_TITLES = [
+            "Active Research Groups",
+            "Current Research Groups",
+            "Research Groups",
+            "IRTF Groups",
+          ].freeze
           def fetch
-            groups = []
             log "Fetching IRTF groups..."
-            begin
-              doc = fetch_html(BASE_URL)
-              return [] unless doc
-              # First try to extract from the dropdown menu
-              dropdown_groups = extract_from_dropdown(doc)
-              if dropdown_groups.any?
-                log "Found #{dropdown_groups.size} groups in dropdown menu", 1
-                groups.concat(dropdown_groups)
-                return groups
-              end
-              # If dropdown extraction fails, fall back to traditional section-based extraction
-              # Debug the page structure
-              headings = doc.css('h3').map(&:text).join(', ')
-              log "Found headings on IRTF page: #{headings}", 1
-              # Extract active groups
-              active_groups = extract_groups(doc, 'Active Research Groups', 'active')
-              log "Found #{active_groups.size} active IRTF groups", 1
-              # Extract concluded groups
-              concluded_groups = extract_groups(doc, 'Concluded Research Groups', 'concluded')
-              log "Found #{concluded_groups.size} concluded IRTF groups", 1
-              groups.concat(active_groups)
-              groups.concat(concluded_groups)
-              # If still no groups found, try alternative selectors
-              if groups.empty?
-                log "No groups found with standard selectors, trying alternatives...", 1
-                # Try different section titles
-                ['Current Research Groups', 'Research Groups', 'IRTF Groups'].each do |title|
-                  section_groups = extract_groups(doc, title, 'active')
-                  if section_groups.any?
-                    log "Found #{section_groups.size} groups with section title: #{title}", 1
-                    groups.concat(section_groups)
-                  end
-                end
-                # Try a more generic approach if still no groups
-                if groups.empty?
-                  log "Using generic list item selector...", 1
-                  # Find any unordered list with links
-                  doc.css('ul').each do |list|
-                    if list.css('li a').any?
-                      generic_groups = extract_groups_from_list(list, 'active')
-                      if generic_groups.any?
-                        log "Found #{generic_groups.size} groups using generic list selector", 1
-                        groups.concat(generic_groups)
-                      end
-                    end
-                  end
-                end
-              end
-            rescue => e
-              log "Error fetching IRTF groups: #{e.message}", 1
-            end
+            doc = fetch_html(BASE_URL)
+            return build_collection([]) unless doc
+            groups = extract_from_dropdown(doc)
+            return build_collection(groups) if groups.any?
-            groups
+            log "Dropdown extraction empty, falling back to section parsing", 1
+            build_collection(extract_from_sections(doc))
+          rescue StandardError => e
+            log "Error fetching IRTF groups: #{e.message}", 1
+            build_collection([])
           end
-          # Extract groups from the dropdown menu
-          # @param doc [Nokogiri::HTML::Document] The HTML document
-          # @return [Array<Ietf::Data::Importer::Group>] Array of Group objects
-          def extract_from_dropdown(doc)
-            groups = []
+          private
-            # Look for the dropdown menu containing research groups
-            dropdown = doc.css('a.dropdown-toggle').find do |el|
-              el.text.include?('Research Groups')
+          def extract_from_dropdown(doc)
+            dropdown = doc.css("a.dropdown-toggle").find do |el|
+              el.text.include?("Research Groups")
             end
             return [] unless dropdown
-            # Find the dropdown menu
-            dropdown_parent = dropdown.parent
-            dropdown_menu = dropdown_parent.css('.dropdown-menu')
-            return [] unless dropdown_menu.any?
+            menu = dropdown.parent.css(".dropdown-menu")
+            return [] unless menu.any?
             log "Found dropdown menu with research groups", 1
-            # Extract groups from the dropdown menu
-            dropdown_menu.css('a.dropdown-item').each do |link|
-              next unless link && link['href']
+            menu.css("a.dropdown-item").filter_map do |link|
+              next unless link && link["href"]
-              name = link.text.strip
-              href = link['href']
-              # Extract abbreviation from href (e.g., cfrg.html -> CFRG)
-              if href =~ /(\w+)\.html$/
-                abbreviation = $1.upcase
-              else
-                next # Skip if we can't determine abbreviation
-              end
-              # Construct full URL if it's a relative path
-              details_url = href
-              if !details_url.start_with?('http')
-                if details_url.start_with?('/')
-                  details_url = "https://www.irtf.org#{details_url}"
-                else
-                  details_url = "https://www.irtf.org/#{details_url}"
-                end
-              end
-              begin
-                details = fetch_group_details(details_url)
-                group = Importer::Group.new(
-                  abbreviation: abbreviation,
-                  name: name,
-                  organization: 'irtf',
-                  type: 'rg',
-                  area: nil,
-                  status: 'active', # Assume active since it's in the menu
-                  description: nil, # Will be populated from details page if available
-                  chairs: details[:chairs],
-                  mailing_list: details[:mailing_list],
-                  mailing_list_archive: details[:mailing_list_archive],
-                  website_url: details_url,
-                  charter_url: details[:charter_url],
-                  concluded_date: details[:concluded_date]
-                )
-                groups << group
-              rescue => e
-                log "Error fetching details for #{abbreviation} (#{details_url}): #{e.message}", 2
-              end
-            end
+              abbreviation = extract_abbreviation_from_href(link["href"])
+              next unless abbreviation
-            groups
+              details_url = resolve_url(link["href"])
+              details = fetch_group_details(details_url)
+              build_group(
+                abbreviation: abbreviation,
+                name: link.text.strip,
+                organization: "irtf",
+                type: "rg",
+                status: "active",
+                website_url: details_url,
+                **details,
+              )
+            rescue StandardError => e
+              log "Error fetching details for #{abbreviation} (#{details_url}): #{e.message}",
+                  2
+              nil
+            end
           end
-          private
+          def extract_from_sections(doc)
+            log "Found headings: #{doc.css('h3').map(&:text).join(', ')}", 1
-          # Extract groups from a section on the IRTF page
-          # @param doc [Nokogiri::HTML::Document] The HTML document
-          # @param section_title [String] The title of the section to extract from
-          # @param status [String] The status of the groups in this section (active/concluded)
-          # @return [Array<Ietf::Data::Importer::Group>] Array of Group objects
-          def extract_groups(doc, section_title, status)
-            groups = []
-            section = doc.xpath("//h3[contains(text(), '#{section_title}')]/following-sibling::ul[1]")
+            active = extract_from_section(doc, "Active Research Groups",
+                                          "active")
+            log "Found #{active.size} active IRTF groups", 1
-            section.css('li').each do |group_item|
-              link = group_item.at_css('a')
-              next unless link
+            concluded = extract_from_section(doc, "Concluded Research Groups",
+                                             "concluded")
+            log "Found #{concluded.size} concluded IRTF groups", 1
-              name = link.text.strip
-              abbreviation = nil
+            groups = active + concluded
+            return groups if groups.any?
-              # Extract abbreviation from the text (typically in parentheses)
-              if name =~ /\(([^)]+)\)/
-                abbreviation = $1
-              end
-              # If unable to extract abbreviation, try from the URL
-              if abbreviation.nil? && link['href'] =~ %r{/(\w+)/?$}
-                abbreviation = $1.upcase
-              end
+            log "No groups found with standard selectors, trying alternatives...",
+                1
+            extract_from_fallback_sections(doc)
+          end
-              next unless abbreviation
+          def extract_from_section(doc, title, status)
+            section = doc.xpath("//h3[contains(text(), '#{title}')]/following-sibling::ul[1]")
+            extract_groups_from_list(section, status)
+          end
-              # Extract description (text after the link)
-              description = group_item.text.sub(link.text, '').strip
-              # Remove parenthesized abbreviation from description
-              description = description.sub(/\s*\([^)]+\)\s*/, ' ').strip
-              # Get details from the group's page
-              details_url = link['href']
-              begin
-                details = fetch_group_details(details_url)
-                group = Importer::Group.new(
-                  abbreviation: abbreviation,
-                  name: name.sub(/\s*\([^)]+\)\s*/, '').strip,
-                  organization: 'irtf',
-                  type: 'rg',
-                  area: nil,
-                  status: status,
-                  description: description,
-                  chairs: details[:chairs],
-                  mailing_list: details[:mailing_list],
-                  mailing_list_archive: details[:mailing_list_archive],
-                  website_url: details_url,
-                  charter_url: details[:charter_url],
-                  concluded_date: details[:concluded_date]
-                )
-                groups << group
-              rescue => e
-                log "Error fetching details for #{abbreviation}: #{e.message}", 2
-              end
+          def extract_from_fallback_sections(doc)
+            SECTION_TITLES.each do |title|
+              groups = extract_from_section(doc, title, "active")
+              return groups if groups.any?
             end
-            groups
+            doc.css("ul").flat_map do |list|
+              next [] unless list.css("li a").any?
+              extract_groups_from_list(list, "active")
+            end
           end
-          # Helper method to extract groups from any list without requiring a specific section heading
-          # @param list_element [Nokogiri::XML::Element] The list element to extract from
-          # @param status [String] The status of the groups in this list (active/concluded)
-          # @return [Array<Ietf::Data::Importer::Group>] Array of Group objects
           def extract_groups_from_list(list_element, status)
-            groups = []
-            list_element.css('li').each do |group_item|
-              link = group_item.at_css('a')
-              next unless link && link['href']
+            list_element.css("li").filter_map do |item|
+              link = item.at_css("a")
+              next unless link && link["href"]
               name = link.text.strip
-              abbreviation = nil
-              # Extract abbreviation from the text (typically in parentheses)
-              if name =~ /\(([^)]+)\)/
-                abbreviation = $1
-              end
-              # If unable to extract abbreviation, try from the URL
-              if abbreviation.nil? && link['href'] =~ %r{/(\w+)/?$}
-                abbreviation = $1.upcase
-              end
-              next unless abbreviation && !abbreviation.empty?
-              # Extract description (text after the link)
-              description = group_item.text.sub(link.text, '').strip
-              # Remove parenthesized abbreviation from description
-              description = description.sub(/\s*\([^)]+\)\s*/, ' ').strip
-              # Get details from the group's page
-              details_url = link['href']
-              # Ensure we have a full URL
-              if !details_url.start_with?('http')
-                if details_url.start_with?('/')
-                  details_url = "https://www.irtf.org#{details_url}"
-                else
-                  details_url = "https://www.irtf.org/#{details_url}"
-                end
-              end
-              begin
-                details = fetch_group_details(details_url)
-                group = Importer::Group.new(
-                  abbreviation: abbreviation,
-                  name: name.sub(/\s*\([^)]+\)\s*/, '').strip,
-                  organization: 'irtf',
-                  type: 'rg',
-                  area: nil,
-                  status: status,
-                  description: description,
-                  chairs: details[:chairs],
-                  mailing_list: details[:mailing_list],
-                  mailing_list_archive: details[:mailing_list_archive],
-                  website_url: details_url,
-                  charter_url: details[:charter_url],
-                  concluded_date: details[:concluded_date]
-                )
-                groups << group
-              rescue => e
-                log "Error fetching details for #{abbreviation} (#{details_url}): #{e.message}", 2
-              end
+              abbreviation = extract_abbreviation(name, link["href"])
+              next unless abbreviation
+              description = extract_description(item, link)
+              details_url = resolve_url(link["href"])
+              details = fetch_group_details(details_url)
+              build_group(
+                abbreviation: abbreviation,
+                name: name.sub(/\s*\([^)]+\)\s*/, "").strip,
+                organization: "irtf",
+                type: "rg",
+                status: status,
+                description: description,
+                website_url: details_url,
+                **details,
+              )
+            rescue StandardError => e
+              log "Error fetching details for #{abbreviation} (#{details_url}): #{e.message}",
+                  2
+              nil
             end
+          end
-            groups
+          def extract_abbreviation(name, href)
+            if name =~ /\(([^)]+)\)/
+              $1
+            elsif href =~ %r{/(\w+)/?$}
+              $1.upcase
+            end
           end
-          # Fetch details for a specific IRTF group from its page
-          # @param url [String] The URL of the group's page
-          # @return [Hash] Hash of group details
-          def fetch_group_details(url)
-            details = {
-              chairs: [],
-              mailing_list: nil,
-              mailing_list_archive: nil,
-              charter_url: nil,
-              concluded_date: nil
-            }
+          def extract_abbreviation_from_href(href)
+            $1.upcase if href =~ /(\w+)\.html$/
+          end
-            doc = fetch_html(url)
-            return details unless doc
+          def extract_description(item, link)
+            item.text.sub(link.text, "").sub(/\s*\([^)]+\)\s*/, " ").strip
+          end
-            # Extract chairs
-            chair_section = doc.xpath("//h3[contains(text(), 'Chair')]/following-sibling::p[1]")
-            if chair_section
-              details[:chairs] << chair_section.text.strip
+          def resolve_url(href)
+            case href
+            when %r{\Ahttps?://} then href
+            when %r{\A/} then "https://www.irtf.org#{href}"
+            else "https://www.irtf.org/#{href}"
             end
+          end
-            # Extract mailing list
-            mailing_list = doc.at_css('a[href^="mailto:"]')
-            if mailing_list
-              details[:mailing_list] = mailing_list['href'].sub('mailto:', '')
-            end
+          def fetch_group_details(url)
+            doc = fetch_html(url)
+            return {} unless doc
+            {
+              chairs: extract_chairs(doc),
+              mailing_list: doc.at_css('a[href^="mailto:"]')&.[]("href")&.sub(
+                "mailto:", ""
+              ),
+              mailing_list_archive: doc.at_css('a[href*="mailarchive.ietf.org"]')&.[]("href"),
+              charter_url: extract_charter_url(doc, url),
+              concluded_date: extract_concluded_date(doc, url),
+            }
+          end
-            # Extract mailing list archive
-            archive = doc.at_css('a[href*="mailarchive.ietf.org"]')
-            if archive
-              details[:mailing_list_archive] = archive['href']
-            end
+          def extract_chairs(doc)
+            chair = doc.xpath("//h3[contains(text(), 'Chair')]/following-sibling::p[1]")
+            chair ? [chair.text.strip] : []
+          end
-            # Extract charter URL
-            charter_link = doc.at_css('a[href*="charter"]')
-            if charter_link
-              details[:charter_url] = URI.join(url, charter_link['href']).to_s
-            end
+          def extract_charter_url(doc, base_url)
+            link = doc.at_css('a[href*="charter"]')
+            URI.join(base_url, link["href"]).to_s if link
+          end
-            # Extract concluded date from the page or the URL
-            if url.include?('/concluded/')
-              concluded_info = doc.text.match(/concluded in\s+([A-Z][a-z]+\s+\d{4})/)
-              if concluded_info
-                begin
-                  details[:concluded_date] = Date.parse(concluded_info[1])
-                rescue
-                  # Just leave it as nil if we can't parse it
-                end
-              end
-            end
+          def extract_concluded_date(doc, url)
+            return nil unless url.include?("/concluded/")
-            details
+            match = doc.text.match(/concluded in\s+([A-Z][a-z]+\s+\d{4})/)
+            Date.parse(match[1]) if match
+          rescue Date::Error
+            nil
           end
         end
       end

data/lib/ietf/data/importer/scrapers.rb CHANGED Viewed

@@ -3,61 +3,33 @@
 require_relative "scrapers/base_scraper"
 require_relative "scrapers/ietf_scraper"
 require_relative "scrapers/irtf_scraper"
-require_relative "group_collection"
 module Ietf
   module Data
     module Importer
-      # Module for IETF/IRTF web scrapers
       module Scrapers
-        # Fetch all IETF and IRTF groups
-        # @return [Ietf::Data::Importer::GroupCollection] Collection of all groups
         def self.fetch_all
           puts "Starting to fetch IETF and IRTF group data..."
-          # Fetch IETF groups
-          ietf_groups = IetfScraper.new.fetch
-          puts "Fetched #{ietf_groups.size} IETF groups"
+          ietf = fetch_ietf
+          puts "Fetched #{ietf.size} IETF groups"
-          # Fetch IRTF groups
-          irtf_groups = IrtfScraper.new.fetch
-          puts "Fetched #{irtf_groups.size} IRTF groups"
+          irtf = fetch_irtf
+          puts "Fetched #{irtf.size} IRTF groups"
-          # Combine all groups and return as a collection
-          all_groups = ietf_groups + irtf_groups
-          puts "Total: #{all_groups.size} groups"
+          merged = ietf.merge(irtf)
+          puts "Total: #{merged.size} groups"
-          Importer::GroupCollection.new(groups: all_groups)
+          merged
         end
-        # Fetch IETF groups only
-        # @return [Array<Ietf::Data::Importer::Group>] Array of IETF groups
         def self.fetch_ietf
           IetfScraper.new.fetch
         end
-        # Fetch IRTF groups only
-        # @return [Array<Ietf::Data::Importer::Group>] Array of IRTF groups
         def self.fetch_irtf
           IrtfScraper.new.fetch
         end
-        # Save group collection to a file
-        # @param collection [Ietf::Data::Importer::GroupCollection] Group collection to save
-        # @param file_path [String] Path to the output file
-        # @param format [Symbol] Output format (:yaml or :json)
-        def self.save_to_file(collection, file_path, format = :yaml)
-          case format.to_sym
-          when :yaml
-            File.write(file_path, collection.to_yaml)
-          when :json
-            File.write(file_path, collection.to_json)
-          else
-            raise ArgumentError, "Unsupported format: #{format}"
-          end
-          puts "Saved #{collection.groups.size} groups to #{file_path}"
-        end
       end
     end
   end

data/lib/ietf/data/importer/version.rb CHANGED Viewed

@@ -3,7 +3,7 @@
 module Ietf
   module Data
     module Importer
-      VERSION = "0.3.0"
+      VERSION = "0.3.1"
     end
   end
 end