RubyGems - nitfr - Versions diffs - 1.0.0 → 1.1.0 - Mend

nitfr 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17)

data/lib/nitfr/exporter.rb ADDED Viewed

@@ -0,0 +1,257 @@
+# frozen_string_literal: true
+module NITFr
+  # Provides export functionality for NITF documents
+  #
+  # Supports conversion to Markdown, plain text, and HTML formats.
+  module Exporter
+    # Convert document to Markdown format
+    #
+    # @return [String] Markdown representation of the document
+    def to_markdown
+      lines = []
+      # Title/Headline
+      if headline
+        lines << "# #{headline}"
+        lines << ""
+      end
+      # Byline
+      if byline&.text
+        lines << "*#{byline.text}*"
+        lines << ""
+      end
+      # Dateline
+      if body&.dateline
+        lines << "**#{body.dateline}**"
+        lines << ""
+      end
+      # Abstract
+      if body&.abstract
+        lines << "> #{body.abstract}"
+        lines << ""
+      end
+      # Paragraphs
+      paragraphs.each do |para|
+        lines << format_paragraph_markdown(para)
+        lines << ""
+      end
+      # Block quotes
+      body&.block_quotes&.each do |quote|
+        lines << "> #{quote}"
+        lines << ""
+      end
+      # Footnotes
+      if footnotes.any?
+        lines << "---"
+        lines << ""
+        footnotes.each do |fn|
+          label = fn.label || "*"
+          lines << "[#{label}]: #{fn.value}"
+        end
+        lines << ""
+      end
+      lines.join("\n").strip
+    end
+    # Convert document to plain text format
+    #
+    # @return [String] plain text representation of the document
+    def to_text
+      lines = []
+      # Title/Headline
+      if headline
+        lines << headline.upcase
+        lines << "=" * headline.length
+        lines << ""
+      end
+      # Byline
+      if byline&.text
+        lines << byline.text
+        lines << ""
+      end
+      # Dateline
+      if body&.dateline
+        lines << body.dateline
+        lines << ""
+      end
+      # Paragraphs
+      paragraphs.each do |para|
+        lines << para.text
+        lines << ""
+      end
+      # Block quotes
+      body&.block_quotes&.each do |quote|
+        lines << "  \"#{quote}\""
+        lines << ""
+      end
+      # Footnotes
+      if footnotes.any?
+        lines << "-" * 40
+        lines << ""
+        footnotes.each do |fn|
+          label = fn.label || "*"
+          lines << "[#{label}] #{fn.value}"
+        end
+        lines << ""
+      end
+      lines.join("\n").strip
+    end
+    # Convert document to HTML format
+    #
+    # @param include_wrapper [Boolean] whether to include html/body tags (default: false)
+    # @return [String] HTML representation of the document
+    def to_html(include_wrapper: false)
+      html_parts = []
+      # Article container
+      html_parts << "<article>"
+      # Header section
+      html_parts << "  <header>"
+      if headline
+        html_parts << "    <h1>#{escape_html(headline)}</h1>"
+      end
+      if byline&.text
+        html_parts << "    <p class=\"byline\">#{escape_html(byline.text)}</p>"
+      end
+      if body&.dateline
+        html_parts << "    <p class=\"dateline\">#{escape_html(body.dateline)}</p>"
+      end
+      html_parts << "  </header>"
+      # Abstract
+      if body&.abstract
+        html_parts << "  <aside class=\"abstract\">"
+        html_parts << "    <p>#{escape_html(body.abstract)}</p>"
+        html_parts << "  </aside>"
+      end
+      # Main content
+      html_parts << "  <section class=\"content\">"
+      paragraphs.each do |para|
+        html_parts << format_paragraph_html(para)
+      end
+      # Block quotes
+      body&.block_quotes&.each do |quote|
+        html_parts << "    <blockquote>"
+        html_parts << "      <p>#{escape_html(quote)}</p>"
+        html_parts << "    </blockquote>"
+      end
+      html_parts << "  </section>"
+      # Footnotes
+      if footnotes.any?
+        html_parts << "  <footer class=\"footnotes\">"
+        html_parts << "    <ol>"
+        footnotes.each do |fn|
+          id_attr = fn.id ? " id=\"#{escape_html(fn.id)}\"" : ""
+          html_parts << "      <li#{id_attr}>#{escape_html(fn.value)}</li>"
+        end
+        html_parts << "    </ol>"
+        html_parts << "  </footer>"
+      end
+      html_parts << "</article>"
+      content = html_parts.join("\n")
+      if include_wrapper
+        wrap_html(content)
+      else
+        content
+      end
+    end
+    private
+    def format_paragraph_markdown(para)
+      text = para.text
+      # Add emphasis markers
+      para.emphasis.each do |em|
+        text = text.gsub(em, "*#{em}*")
+      end
+      # Add strong markers
+      para.strong.each do |strong|
+        text = text.gsub(strong, "**#{strong}**")
+      end
+      text
+    end
+    def format_paragraph_html(para)
+      text = escape_html(para.text)
+      # Convert line breaks to <br>
+      text = text.gsub("\n", "<br>\n")
+      # Add emphasis tags
+      para.emphasis.each do |em|
+        escaped = escape_html(em)
+        text = text.gsub(escaped, "<em>#{escaped}</em>")
+      end
+      # Add strong tags
+      para.strong.each do |strong|
+        escaped = escape_html(strong)
+        text = text.gsub(escaped, "<strong>#{escaped}</strong>")
+      end
+      classes = []
+      classes << "lead" if para.lead?
+      class_attr = classes.any? ? " class=\"#{classes.join(' ')}\"" : ""
+      "    <p#{class_attr}>#{text}</p>"
+    end
+    def escape_html(text)
+      return "" if text.nil?
+      text.to_s
+          .gsub("&", "&amp;")
+          .gsub("<", "&lt;")
+          .gsub(">", "&gt;")
+          .gsub('"', "&quot;")
+    end
+    def wrap_html(content)
+      <<~HTML
+        <!DOCTYPE html>
+        <html lang="en">
+        <head>
+          <meta charset="UTF-8">
+          <meta name="viewport" content="width=device-width, initial-scale=1.0">
+          <title>#{escape_html(title || headline || 'NITF Document')}</title>
+        </head>
+        <body>
+        #{content}
+        </body>
+        </html>
+      HTML
+    end
+  end
+end

data/lib/nitfr/footnote.rb ADDED Viewed

@@ -0,0 +1,63 @@
+# frozen_string_literal: true
+module NITFr
+  # Represents a footnote from an NITF document
+  #
+  # Footnotes can appear in body.content or body.end and contain
+  # a label (reference marker) and value (the footnote text).
+  class Footnote
+    attr_reader :node
+    def initialize(node)
+      @node = node
+    end
+    # Get the footnote ID
+    #
+    # @return [String, nil] the footnote ID attribute
+    def id
+      node.attributes["id"]
+    end
+    # Get the footnote label (reference marker)
+    #
+    # @return [String, nil] the label text (e.g., "1", "*", "a")
+    def label
+      @label ||= xpath_text("fn-label")
+    end
+    # Get the footnote value (content)
+    #
+    # @return [String, nil] the footnote text content
+    def value
+      @value ||= xpath_text("fn-value")
+    end
+    alias text value
+    alias content value
+    # Check if footnote has content
+    #
+    # @return [Boolean] true if footnote has a value
+    def present?
+      !value.nil? && !value.empty?
+    end
+    # Convert footnote to a Hash representation
+    #
+    # @return [Hash] the footnote as a hash
+    def to_h
+      {
+        id: id,
+        label: label,
+        value: value
+      }.compact
+    end
+    private
+    def xpath_text(path)
+      element = REXML::XPath.first(node, path)
+      element&.text&.strip
+    end
+  end
+end

data/lib/nitfr/head.rb CHANGED Viewed

@@ -60,6 +60,20 @@ module NITFr
       end
     end
+    # Convert head to a Hash representation
+    #
+    # @return [Hash] the head as a hash
+    def to_h
+      {
+        title: title,
+        meta: meta.empty? ? nil : meta,
+        keywords: keywords.empty? ? nil : keywords,
+        pubdata: pubdata.empty? ? nil : pubdata,
+        revision_history: revision_history.empty? ? nil : revision_history,
+        docdata: docdata&.to_h
+      }.compact
+    end
     private
     def xpath_first(path)

data/lib/nitfr/headline.rb CHANGED Viewed

@@ -3,7 +3,7 @@
 module NITFr
   # Represents headline information from an NITF document
   #
-  # NITF supports multiple headline levels (hl1, hl2) as well as
+  # NITF supports multiple headline levels (hl1 through hl5) as well as
   # headline (alternate headline) elements.
   class Headline
     attr_reader :node
@@ -28,11 +28,35 @@ module NITFr
     end
     alias hl2 secondary
+    # Get the tertiary headline (hl3)
+    #
+    # @return [String, nil] the tertiary headline text
+    def tertiary
+      @tertiary ||= xpath_first("hl3")&.text&.strip
+    end
+    alias hl3 tertiary
+    # Get the quaternary headline (hl4)
+    #
+    # @return [String, nil] the quaternary headline text
+    def quaternary
+      @quaternary ||= xpath_first("hl4")&.text&.strip
+    end
+    alias hl4 quaternary
+    # Get the quinary headline (hl5)
+    #
+    # @return [String, nil] the quinary headline text
+    def quinary
+      @quinary ||= xpath_first("hl5")&.text&.strip
+    end
+    alias hl5 quinary
     # Get all headline levels as an array
     #
     # @return [Array<String>] array of headline texts in order
     def all
-      @all ||= [primary, secondary].compact
+      @all ||= [primary, secondary, tertiary, quaternary, quinary].compact
     end
     # Get the full headline text (all levels joined)
@@ -46,7 +70,20 @@ module NITFr
     #
     # @return [Boolean] true if any headline text exists
     def present?
-      !primary.nil? || !secondary.nil?
+      all.any?
+    end
+    # Convert headline to a Hash representation
+    #
+    # @return [Hash] the headline as a hash
+    def to_h
+      {
+        primary: primary,
+        secondary: secondary,
+        tertiary: tertiary,
+        quaternary: quaternary,
+        quinary: quinary
+      }.compact
     end
     private

data/lib/nitfr/media.rb CHANGED Viewed

@@ -126,6 +126,24 @@ module NITFr
       }.compact
     end
+    # Convert media to a Hash representation
+    #
+    # @return [Hash] the media as a hash
+    def to_h
+      {
+        type: type,
+        source: source,
+        mime_type: mime_type,
+        width: width,
+        height: height,
+        alt_text: alt_text,
+        caption: caption,
+        credit: credit,
+        metadata: metadata.empty? ? nil : metadata,
+        references: references.size > 1 ? references : nil
+      }.compact
+    end
     private
     def xpath_first(path)

data/lib/nitfr/paragraph.rb CHANGED Viewed

@@ -11,6 +11,7 @@ module NITFr
   # arrays on first access to any entity method.
   class Paragraph
     include TextExtractor
+    include SearchPattern
     attr_reader :node
@@ -48,7 +49,7 @@ module NITFr
       lede == "true" || lede == "yes"
     end
-    # Get any emphasized text within the paragraph
+    # Get any emphasized text within the paragraph (em tags)
     #
     # @return [Array<String>] array of emphasized text
     def emphasis
@@ -56,6 +57,14 @@ module NITFr
       @emphasis
     end
+    # Get any strong/bold text within the paragraph (strong tags)
+    #
+    # @return [Array<String>] array of strong text
+    def strong
+      extract_entities unless @entities_extracted
+      @strong
+    end
     # Get any links within the paragraph
     #
     # @return [Array<Hash>] array of link info hashes
@@ -111,6 +120,107 @@ module NITFr
       text.split(/\s+/).size
     end
+    # =========================================================================
+    # Search Helper Methods
+    # =========================================================================
+    # Check if paragraph contains the given text
+    #
+    # @param query [String, Regexp] the search query
+    # @param case_sensitive [Boolean] whether search is case-sensitive (default: false)
+    # @return [Boolean] true if text is found
+    def contains?(query, case_sensitive: false)
+      pattern = build_search_pattern(query, case_sensitive)
+      text.match?(pattern)
+    end
+    # Check if paragraph mentions a specific person
+    #
+    # Compares +name+ against the entries of #people via entity_match?.
+    #
+    # @param name [String] the person name to search for
+    # @param exact [Boolean] if true, an entry must equal +name+ exactly
+    #   (case-sensitive); if false, +name+ may appear anywhere inside an
+    #   entry, ignoring case (default: false)
+    # @return [Boolean] true if person is mentioned
+    def mentions_person?(name, exact: false)
+      entity_match?(people, name, exact)
+    end
+    # Check if paragraph mentions a specific organization
+    #
+    # Compares +name+ against the entries of #organizations via
+    # entity_match?.
+    #
+    # @param name [String] the organization name to search for
+    # @param exact [Boolean] if true, an entry must equal +name+ exactly
+    #   (case-sensitive); if false, +name+ may appear anywhere inside an
+    #   entry, ignoring case (default: false)
+    # @return [Boolean] true if organization is mentioned
+    def mentions_org?(name, exact: false)
+      entity_match?(organizations, name, exact)
+    end
+    # Check if paragraph mentions a specific location
+    #
+    # Compares +name+ against the entries of #locations via entity_match?.
+    #
+    # @param name [String] the location name to search for
+    # @param exact [Boolean] if true, an entry must equal +name+ exactly
+    #   (case-sensitive); if false, +name+ may appear anywhere inside an
+    #   entry, ignoring case (default: false)
+    # @return [Boolean] true if location is mentioned
+    def mentions_location?(name, exact: false)
+      entity_match?(locations, name, exact)
+    end
+    # Check if paragraph mentions any of the given entities
+    #
+    # @param person [String, nil] person name to check
+    # @param org [String, nil] organization name to check
+    # @param location [String, nil] location name to check
+    # @return [Boolean] true if any specified entity is mentioned
+    def mentions?(person: nil, org: nil, location: nil)
+      return false if person.nil? && org.nil? && location.nil?
+      (person && mentions_person?(person)) ||
+        (org && mentions_org?(org)) ||
+        (location && mentions_location?(location))
+    end
+    # Check if paragraph has any links
+    #
+    # Delegates to the #links collection.
+    #
+    # @return [Boolean] true if paragraph contains links
+    def has_links?
+      links.any?
+    end
+    # Check if paragraph has any emphasis
+    #
+    # Delegates to the #emphasis collection.
+    #
+    # @return [Boolean] true if paragraph contains emphasized text
+    def has_emphasis?
+      emphasis.any?
+    end
+    # Check if paragraph has any strong/bold text
+    #
+    # Delegates to the #strong collection.
+    #
+    # @return [Boolean] true if paragraph contains strong text
+    def has_strong?
+      strong.any?
+    end
+    # Check if paragraph mentions any entities
+    #
+    # Checks #people, then #organizations, then #locations, stopping at
+    # the first non-empty collection.
+    #
+    # @return [Boolean] true if paragraph contains any person, org, or location references
+    def has_entities?
+      people.any? || organizations.any? || locations.any?
+    end
+    # Convert paragraph to a Hash representation
+    #
+    # @return [Hash] the paragraph as a hash
+    def to_h
+      {
+        id: id,
+        text: text,
+        lead: lead? || nil,
+        word_count: word_count,
+        people: people.empty? ? nil : people,
+        organizations: organizations.empty? ? nil : organizations,
+        locations: locations.empty? ? nil : locations,
+        emphasis: emphasis.empty? ? nil : emphasis,
+        strong: strong.empty? ? nil : strong,
+        links: links.empty? ? nil : links
+      }.compact
+    end
     private
     # Extract all entities in a single DOM traversal
@@ -122,6 +232,7 @@ module NITFr
       @organizations = []
       @locations = []
       @emphasis = []
+      @strong = []
       @links = []
       traverse_for_entities(node)
@@ -147,6 +258,9 @@ module NITFr
         when "em"
           text = child.text&.strip
           @emphasis << text if text && !text.empty?
+        when "strong"
+          text = child.text&.strip
+          @strong << text if text && !text.empty?
         when "a"
           @links << {
             text: child.text&.strip,
@@ -158,5 +272,20 @@ module NITFr
         traverse_for_entities(child)
       end
     end
+    # Check if any entity matches the given name
+    #
+    # @param entities [Array<String>] array of entity names
+    # @param name [String] name to search for
+    # @param exact [Boolean] require exact match
+    # @return [Boolean] true if match found
+    def entity_match?(entities, name, exact)
+      if exact
+        entities.any? { |e| e == name }
+      else
+        pattern = /#{Regexp.escape(name)}/i
+        entities.any? { |e| e.match?(pattern) }
+      end
+    end
   end
 end

data/lib/nitfr/search_pattern.rb ADDED Viewed

@@ -0,0 +1,29 @@
+# frozen_string_literal: true
+module NITFr
+  # Shared module for building search patterns from queries
+  #
+  # Provides consistent pattern building across Document and Paragraph
+  # search methods, with proper escaping and case sensitivity handling.
+  module SearchPattern
+    private
+    # Build a regex pattern from query
+    #
+    # @param query [String, Regexp] the search query
+    # @param case_sensitive [Boolean] whether search is case-sensitive
+    # @return [Regexp] compiled pattern
+    def build_search_pattern(query, case_sensitive)
+      if query.is_a?(Regexp)
+        if case_sensitive
+          query
+        else
+          # Preserve original flags while adding case insensitivity
+          Regexp.new(query.source, query.options | Regexp::IGNORECASE)
+        end
+      else
+        Regexp.new(Regexp.escape(query.to_s), case_sensitive ? nil : Regexp::IGNORECASE)
+      end
+    end
+  end
+end

data/lib/nitfr/text_extractor.rb CHANGED Viewed

@@ -6,9 +6,14 @@ module NITFr
   # REXML's built-in text method only returns direct text content,
   # not text from nested elements. This module provides a method
   # to recursively extract all text content.
+  #
+  # Preserves hard line breaks (<br/>) as newline characters.
   module TextExtractor
     # Extract all text content from an element and its descendants
     #
+    # Converts <br/> elements to newline characters to preserve
+    # intended line breaks within content.
+    #
     # @param element [REXML::Element] the element to extract text from
     # @return [String] the concatenated text content
     def extract_all_text(element)
@@ -17,7 +22,12 @@ module NITFr
         if child.is_a?(REXML::Text)
           result << child.value
         elsif child.is_a?(REXML::Element)
-          result << extract_all_text(child)
+          # Convert <br/> to newline
+          if child.name == "br"
+            result << "\n"
+          else
+            result << extract_all_text(child)
+          end
         end
       end
       result

data/lib/nitfr/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module NITFr
-  VERSION = "1.0.0"
+  VERSION = "1.1.0"
 end

data/lib/nitfr.rb CHANGED Viewed

@@ -13,6 +13,8 @@ end
 require_relative "nitfr/version"
 require_relative "nitfr/errors"
 require_relative "nitfr/text_extractor"
+require_relative "nitfr/search_pattern"
+require_relative "nitfr/exporter"
 require_relative "nitfr/document"
 require_relative "nitfr/head"
 require_relative "nitfr/body"
@@ -20,6 +22,7 @@ require_relative "nitfr/headline"
 require_relative "nitfr/byline"
 require_relative "nitfr/paragraph"
 require_relative "nitfr/media"
+require_relative "nitfr/footnote"
 require_relative "nitfr/docdata"
 module NITFr