nitfr 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,162 @@
1
+ # frozen_string_literal: true
2
+
3
module NITFr
  # Represents a paragraph from an NITF document body.
  #
  # Wraps the paragraph's XML node and exposes its plain text, its +id+ and
  # +lede+ attributes, and the inline entities (people, organizations,
  # locations, emphasis, links) found anywhere beneath it.
  #
  # Entity extraction is lazy and batched: the first call to any entity
  # reader walks the subtree once and populates every entity array.
  class Paragraph
    include TextExtractor

    # @return [REXML::Element] the wrapped paragraph node
    attr_reader :node

    # @param node [REXML::Element] the paragraph element to wrap
    def initialize(node)
      @node = node
      @entities_extracted = false
    end

    # Get the plain text content of the paragraph.
    #
    # The value is computed once and memoized.
    #
    # @return [String] the paragraph text with inline elements stripped
    def text
      @text ||= extract_all_text(node).strip
    end
    alias to_s text

    # Get the paragraph ID if present.
    #
    # @return [String, nil] the value of the +id+ attribute
    def id
      node.attributes["id"]
    end

    # Get the paragraph's lede attribute (indicates lead paragraph).
    #
    # @return [String, nil] the value of the +lede+ attribute
    def lede
      node.attributes["lede"]
    end

    # Check if this is a lead paragraph.
    #
    # @return [Boolean] true when +lede+ is exactly "true" or "yes"
    def lead?
      lede == "true" || lede == "yes"
    end

    # Get any emphasized text within the paragraph.
    #
    # @return [Array<String>] text of every non-empty +em+ element
    def emphasis
      extract_entities unless @entities_extracted
      @emphasis
    end

    # Get any links within the paragraph.
    #
    # @return [Array<Hash>] one hash per +a+ element with +:text+
    #   (String or nil) and +:href+ (String or nil)
    def links
      extract_entities unless @entities_extracted
      @links
    end

    # Get any person references in the paragraph.
    #
    # @return [Array<String>] text of every non-empty +person+ element
    def people
      extract_entities unless @entities_extracted
      @people
    end

    # Get any organization references in the paragraph.
    #
    # @return [Array<String>] text of every non-empty +org+ element
    def organizations
      extract_entities unless @entities_extracted
      @organizations
    end

    # Get any location references in the paragraph.
    #
    # @return [Array<String>] text of every non-empty +location+ element
    def locations
      extract_entities unless @entities_extracted
      @locations
    end

    # Get the raw HTML/XML content of the paragraph.
    #
    # @return [String] the serialized child nodes, concatenated
    def inner_html
      node.children.map(&:to_s).join
    end

    # Check if paragraph has content.
    #
    # @return [Boolean] true if the stripped paragraph text is non-empty
    def present?
      !text.empty?
    end

    # Get word count for the paragraph.
    #
    # @return [Integer] number of whitespace-separated tokens in the text
    def word_count
      return 0 if text.empty?

      text.split(/\s+/).size
    end

    private

    # Extract all entities in a single DOM traversal.
    #
    # Resets every entity array, walks the subtree once, and only then
    # marks extraction as done, so a traversal that raises is retried
    # from scratch on the next access.
    def extract_entities
      @people = []
      @organizations = []
      @locations = []
      @emphasis = []
      @links = []

      traverse_for_entities(node)

      @entities_extracted = true
    end

    # Recursively visit every descendant element and record entities.
    #
    # @param element [REXML::Element] the element whose children are visited
    def traverse_for_entities(element)
      element.each_element do |child|
        collect_entity(child)
        # Entities may be nested (e.g. emphasis inside a person), so descend
        # regardless of whether this child matched.
        traverse_for_entities(child)
      end
    end

    # Record +element+ in the matching entity array, if its name is known.
    #
    # @param element [REXML::Element] a descendant of the paragraph
    def collect_entity(element)
      case element.name
      when "person"   then append_text(@people, element)
      when "org"      then append_text(@organizations, element)
      when "location" then append_text(@locations, element)
      when "em"       then append_text(@emphasis, element)
      when "a"
        @links << { text: link_text(element), href: element.attributes["href"] }
      end
    end

    # Append the element's full text to +collection+ unless it is blank.
    #
    # @param collection [Array<String>] the entity array to append to
    # @param element [REXML::Element] the entity element
    def append_text(collection, element)
      content = nested_text(element)
      collection << content unless content.empty?
    end

    # Stripped text of an element including text inside nested elements.
    #
    # REXML's Element#text only returns the first direct text node, which
    # loses or mangles structured entities such as
    # <person><name.given>..</name.given> <name.family>..</name.family></person>.
    #
    # @param element [REXML::Element]
    # @return [String] possibly empty, never nil
    def nested_text(element)
      extract_all_text(element).strip
    end

    # Text for a link hash.
    #
    # Keeps the historical contract for links with no usable text: nil when
    # the element has no direct text node, "" when it is whitespace only.
    #
    # @param element [REXML::Element] the +a+ element
    # @return [String, nil]
    def link_text(element)
      content = nested_text(element)
      return content unless content.empty?

      content unless element.text.nil?
    end
  end
end
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
module NITFr
  # Mixin that gathers text from an REXML element and all of its descendants.
  #
  # REXML's own Element#text only yields direct text content, so nested
  # inline markup would otherwise be lost.
  module TextExtractor
    # Concatenate, in document order, the value of every text node found at
    # any depth beneath +element+. Non-text, non-element nodes are ignored.
    #
    # @param element [REXML::Element] the element to extract text from
    # @return [String] the concatenated text content ("" when there is none)
    def extract_all_text(element)
      element.children.each_with_object(+"") do |node, buffer|
        case node
        when REXML::Text
          buffer << node.value
        when REXML::Element
          buffer << extract_all_text(node)
        end
      end
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
# Namespace for the NITFr gem.
module NITFr
  # Version string of this gem release.
  VERSION = "1.0.0"
end
data/lib/nitfr.rb ADDED
@@ -0,0 +1,48 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "rexml/document"
4
+ require "rexml/xpath"
5
+
6
# Tighten REXML's entity-expansion limits as soon as this file is loaded.
# Low limits guard against XML entity expansion attacks (Billion Laughs).
# The setters live on the REXML::Security module itself; the guard skips
# configuration when that module is not defined.
if defined?(REXML::Security)
  {
    entity_expansion_limit: 100,
    entity_expansion_text_limit: 10_000
  }.each do |setting, ceiling|
    REXML::Security.public_send(:"#{setting}=", ceiling)
  end
end
12
+
13
+ require_relative "nitfr/version"
14
+ require_relative "nitfr/errors"
15
+ require_relative "nitfr/text_extractor"
16
+ require_relative "nitfr/document"
17
+ require_relative "nitfr/head"
18
+ require_relative "nitfr/body"
19
+ require_relative "nitfr/headline"
20
+ require_relative "nitfr/byline"
21
+ require_relative "nitfr/paragraph"
22
+ require_relative "nitfr/media"
23
+ require_relative "nitfr/docdata"
24
+
25
# Public entry points for turning NITF XML into a Document.
module NITFr
  # Build a Document from NITF XML held in memory.
  #
  # @param xml [String] the NITF XML content
  # @return [Document] the parsed document
  # @raise [ParseError] if the XML is invalid or not NITF
  def self.parse(xml)
    Document.new(xml)
  end

  # Read an NITF XML file and build a Document from its contents.
  #
  # The file is read in full with the given encoding and handed to parse.
  #
  # @param path [String] path to the NITF XML file
  # @param encoding [String] the file encoding (default: UTF-8)
  # @return [Document] the parsed document
  # @raise [ParseError] if the file cannot be read or XML is invalid
  # @raise [Errno::ENOENT] if the file does not exist
  def self.parse_file(path, encoding: "UTF-8")
    parse(File.read(path, encoding: encoding))
  end
end
metadata ADDED
@@ -0,0 +1,101 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: nitfr
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Mark Turner
8
+ bindir: exe
9
+ cert_chain: []
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
+ dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: rexml
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - ">="
17
+ - !ruby/object:Gem::Version
18
+ version: '0'
19
+ type: :runtime
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - ">="
24
+ - !ruby/object:Gem::Version
25
+ version: '0'
26
+ - !ruby/object:Gem::Dependency
27
+ name: rake
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - "~>"
31
+ - !ruby/object:Gem::Version
32
+ version: '13.0'
33
+ type: :development
34
+ prerelease: false
35
+ version_requirements: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: '13.0'
40
+ - !ruby/object:Gem::Dependency
41
+ name: test-unit
42
+ requirement: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: '3.6'
47
+ type: :development
48
+ prerelease: false
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - "~>"
52
+ - !ruby/object:Gem::Version
53
+ version: '3.6'
54
+ description: NITFr makes it easy for Ruby applications to parse and extract content
55
+ from NITF XML files, the standard format used in the news industry.
56
+ email:
57
+ - mark@amerine.net
58
+ executables: []
59
+ extensions: []
60
+ extra_rdoc_files: []
61
+ files:
62
+ - LICENSE
63
+ - README.md
64
+ - Rakefile
65
+ - lib/nitfr.rb
66
+ - lib/nitfr/body.rb
67
+ - lib/nitfr/byline.rb
68
+ - lib/nitfr/docdata.rb
69
+ - lib/nitfr/document.rb
70
+ - lib/nitfr/errors.rb
71
+ - lib/nitfr/head.rb
72
+ - lib/nitfr/headline.rb
73
+ - lib/nitfr/media.rb
74
+ - lib/nitfr/paragraph.rb
75
+ - lib/nitfr/text_extractor.rb
76
+ - lib/nitfr/version.rb
77
+ homepage: https://github.com/amerine/nitfr
78
+ licenses:
79
+ - MIT
80
+ metadata:
81
+ homepage_uri: https://github.com/amerine/nitfr
82
+ source_code_uri: https://github.com/amerine/nitfr
83
+ changelog_uri: https://github.com/amerine/nitfr/blob/master/CHANGELOG.md
84
+ rdoc_options: []
85
+ require_paths:
86
+ - lib
87
+ required_ruby_version: !ruby/object:Gem::Requirement
88
+ requirements:
89
+ - - ">="
90
+ - !ruby/object:Gem::Version
91
+ version: 3.0.0
92
+ required_rubygems_version: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ requirements: []
98
+ rubygems_version: 3.6.7
99
+ specification_version: 4
100
+ summary: A Ruby gem for parsing NITF (News Industry Text Format) XML files
101
+ test_files: []