nitfr 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,201 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "date"
4
+
5
module NITFr
  # Represents the docdata section of an NITF document head
  #
  # Docdata contains document metadata including IDs, dates,
  # urgency, and other management information. Values are read lazily
  # from the wrapped REXML node and memoized on first successful lookup.
  class Docdata
    attr_reader :node

    # @param node [REXML::Element] the element whose children are queried
    def initialize(node)
      @node = node
    end

    # Get the document ID
    #
    # @return [String, nil] the id-string attribute of doc-id
    def doc_id
      @doc_id ||= attribute_of("doc-id", "id-string")
    end

    # Get the issue date
    #
    # @return [Date, nil] the parsed issue date, nil if absent or unparseable
    def issue_date
      @issue_date ||= parse_date("date.issue")
    end

    # Get the release date
    #
    # @return [Date, nil] the parsed release date, nil if absent or unparseable
    def release_date
      @release_date ||= parse_date("date.release")
    end

    # Get the expiration date
    #
    # @return [Date, nil] the parsed expire date, nil if absent or unparseable
    def expire_date
      @expire_date ||= parse_date("date.expire")
    end

    # Get urgency level (1-8, 1 being most urgent)
    #
    # The ed-urg attribute is converted with String#to_i, so a non-numeric
    # value yields 0 rather than nil.
    #
    # @return [Integer, nil] the urgency value
    def urgency
      @urgency ||= attribute_of("urgency", "ed-urg")&.to_i
    end

    # Get the document copyright information
    #
    # @return [Hash] copyright details (:holder, :year); empty when absent
    def copyright
      @copyright ||= parse_copyright
    end

    # Get the copyright holder
    #
    # @return [String, nil] the copyright holder
    def copyright_holder
      copyright[:holder]
    end

    # Get the copyright year
    #
    # @return [String, nil] the copyright year
    def copyright_year
      copyright[:year]
    end

    # Get document scope information
    #
    # @return [String, nil] the scope attribute of doc-scope
    def doc_scope
      @doc_scope ||= attribute_of("doc-scope", "scope")
    end

    # Get series information
    #
    # @return [Hash] series details (:name, :part, :total); empty when absent
    def series
      @series ||= parse_series
    end

    # Get editorial status/management info
    #
    # @return [Hash] management status (:info, :message_type); empty when absent
    def management_status
      @management_status ||= parse_management_status
    end

    # Get the fixture identifier
    #
    # @return [String, nil] the fix-id attribute of fixture
    def fixture
      @fixture ||= attribute_of("fixture", "fix-id")
    end

    # Get all identified content (subjects, organizations, people, etc.)
    #
    # @return [Hash] categorized identified content; empty when the
    #   identified-content element is absent
    def identified_content
      @identified_content ||= parse_identified_content
    end

    # Get subject codes/topics
    #
    # @return [Array<String>] array of subjects
    def subjects
      identified_content[:subjects] || []
    end

    # Get location codes
    #
    # @return [Array<String>] array of locations
    def locations
      identified_content[:locations] || []
    end

    # Get organization codes
    #
    # @return [Array<String>] array of organizations
    def organizations
      identified_content[:organizations] || []
    end

    # Get person codes
    #
    # @return [Array<String>] array of people
    def people
      identified_content[:people] || []
    end

    private

    def xpath_first(path)
      REXML::XPath.first(node, path)
    end

    def xpath_match(path)
      REXML::XPath.match(node, path)
    end

    # Read one attribute from the first element matching +path+.
    #
    # @return [String, nil] nil when the element or the attribute is missing
    def attribute_of(path, attribute_name)
      element = xpath_first(path)
      element && element.attributes[attribute_name]
    end

    # Parse the norm attribute of the named date element.
    #
    # @param element_name [String] child element name, e.g. "date.issue"
    # @return [Date, nil] nil when the element/attribute is missing or the
    #   value cannot be parsed
    def parse_date(element_name)
      norm = attribute_of(element_name, "norm")
      return nil unless norm

      Date.parse(norm)
    rescue ArgumentError
      # Date::Error is a subclass of ArgumentError. Rescuing the parent also
      # covers the plain ArgumentError that Date.parse raises for strings
      # longer than its length limit, and keeps working on date versions
      # that predate Date::Error.
      nil
    end

    def parse_copyright
      copyright_node = xpath_first("doc.copyright")
      return {} unless copyright_node

      {
        holder: copyright_node.attributes["holder"],
        year: copyright_node.attributes["year"]
      }.compact
    end

    def parse_series
      series_node = xpath_first("series")
      return {} unless series_node

      {
        name: series_node.attributes["series.name"],
        part: series_node.attributes["series.part"]&.to_i,
        total: series_node.attributes["series.totalpart"]&.to_i
      }.compact
    end

    def parse_management_status
      status_node = xpath_first("ed-msg")
      return {} unless status_node

      {
        info: status_node.attributes["info"],
        message_type: status_node.attributes["msg-type"]
      }.compact
    end

    def parse_identified_content
      id_node = xpath_first("identified-content")
      return {} unless id_node

      {
        subjects: stripped_texts(id_node, "classifier[@type='subject']"),
        locations: stripped_texts(id_node, "location"),
        organizations: stripped_texts(id_node, "org"),
        people: stripped_texts(id_node, "person")
      }
    end

    # Collect the stripped text of every element under +parent+ matching
    # +path+, dropping elements that have no text at all.
    def stripped_texts(parent, path)
      REXML::XPath.match(parent, path).map { |element| element.text&.strip }.compact
    end
  end
end
@@ -0,0 +1,173 @@
1
+ # frozen_string_literal: true
2
+
3
module NITFr
  # Represents a parsed NITF document
  #
  # The Document class is the main entry point for working with NITF content.
  # It provides access to all parts of the NITF structure including head and body.
  #
  # @note This parser does not process external entities (DTD references) for security.
  #   REXML by default does not expand external entities, which protects against XXE attacks.
  class Document
    attr_reader :xml_doc, :head, :body

    # Create a new Document from an NITF XML string
    #
    # @param xml [String] the NITF XML content
    # @raise [ParseError] if the XML is malformed
    # @raise [InvalidDocumentError] if the document is not valid NITF
    def initialize(xml)
      @xml_doc = parse_xml(xml)
      validate_nitf!
      parse_content
    end

    # Get the document's title from the head section
    #
    # @return [String, nil] the document title
    def title
      head&.title
    end

    # Get the main headline text
    #
    # @return [String, nil] the primary headline
    def headline
      body&.headline&.primary
    end

    # Get all headline levels
    #
    # @return [Headline, nil] the headline object with all levels
    def headlines
      body&.headline
    end

    # Get the byline information
    #
    # @return [Byline, nil] the byline object
    def byline
      body&.byline
    end

    # Get all paragraphs from the body content
    #
    # @return [Array<Paragraph>] array of paragraph objects (empty without a body)
    def paragraphs
      body&.paragraphs || []
    end

    # Get the full text content of the article
    #
    # @return [String] paragraph texts joined by a blank line
    def text
      @text ||= paragraphs.map(&:text).join("\n\n")
    end

    # Get all media objects (images, etc.) from the document
    #
    # @return [Array<Media>] array of media objects (empty without a body)
    def media
      body&.media || []
    end

    # Get document metadata from docdata
    #
    # @return [Docdata, nil] the docdata object
    def docdata
      head&.docdata
    end

    # Get the document ID
    #
    # @return [String, nil] the document ID
    def doc_id
      docdata&.doc_id
    end

    # Get the issue date
    #
    # @return [Date, nil] the issue date
    def issue_date
      docdata&.issue_date
    end

    # Get the NITF version from the root element
    #
    # @return [String, nil] the NITF version
    def version
      nitf_root.attributes["version"]
    end

    # Get the change date from the root element
    #
    # @return [String, nil] the change date
    def change_date
      nitf_root.attributes["change.date"]
    end

    # Get the change time from the root element
    #
    # @return [String, nil] the change time
    def change_time
      nitf_root.attributes["change.time"]
    end

    # Check if this is a valid NITF document
    #
    # @return [Boolean] true if a <nitf> element was located
    def valid?
      !nitf_root.nil?
    end

    # Serialize the parsed document back to an XML string
    #
    # The input string is not retained; the result is produced by REXML from
    # the parsed tree and may not be byte-identical to what was passed in.
    #
    # @return [String] the serialized XML
    def to_xml
      @xml_doc.to_s
    end

    private

    # Parse XML string into REXML document
    #
    # REXML does not expand external entities by default, which protects against:
    # - XXE (XML External Entity) attacks
    # - Billion Laughs (entity expansion) attacks
    #
    # Security settings are configured at module load time in lib/nitfr.rb
    #
    # @param xml [String] the XML content
    # @return [REXML::Document] the parsed document
    # @raise [ParseError] wrapping any REXML::ParseException
    def parse_xml(xml)
      REXML::Document.new(xml)
    rescue REXML::ParseException => e
      raise ParseError, "Failed to parse XML: #{e.message}"
    end

    def validate_nitf!
      return if nitf_root

      raise InvalidDocumentError, "Document does not appear to be valid NITF (missing <nitf> root element)"
    end

    # Memoized <nitf> element.
    #
    # The lookup lives in its own method so that its early return cannot
    # bypass the memoizing assignment here.
    def nitf_root
      @nitf_root ||= locate_nitf_root
    end

    # Find the <nitf> element: the document root when it is named "nitf",
    # otherwise the first <nitf> found anywhere in the document.
    #
    # @return [REXML::Element, nil]
    def locate_nitf_root
      # Use direct root access for better performance when nitf is the root element
      root = @xml_doc.root
      return root if root&.name == "nitf"

      # Fall back to XPath search for nested nitf elements
      REXML::XPath.first(@xml_doc, "//nitf")
    end

    def parse_content
      head_node = REXML::XPath.first(nitf_root, "head")
      body_node = REXML::XPath.first(nitf_root, "body")

      @head = Head.new(head_node) if head_node
      @body = Body.new(body_node) if body_node
    end
  end
end
@@ -0,0 +1,12 @@
1
+ # frozen_string_literal: true
2
+
3
module NITFr
  # Common ancestor of every exception defined by NITFr, so callers can
  # rescue the whole family with a single clause.
  class Error < StandardError
  end

  # Signals that the supplied XML could not be parsed.
  class ParseError < Error
  end

  # Signals that the XML parsed but is not a valid NITF document.
  class InvalidDocumentError < Error
  end
end
data/lib/nitfr/head.rb ADDED
@@ -0,0 +1,101 @@
1
+ # frozen_string_literal: true
2
+
3
module NITFr
  # Wraps the head element of an NITF document.
  #
  # Exposes the title, docdata, publication data, revision history and
  # meta tags found beneath the wrapped node. Lookups are performed on
  # demand and cached in instance variables.
  class Head
    attr_reader :node

    # @param node [REXML::Element] the element whose children are queried
    def initialize(node)
      @node = node
    end

    # Title text with surrounding whitespace removed.
    #
    # @return [String, nil] nil when there is no title element or it has no text
    def title
      return @title if @title

      title_element = xpath_first("title")
      @title = title_element&.text&.strip
    end

    # Docdata wrapper for the docdata child element.
    #
    # @return [Docdata, nil] nil when there is no docdata element
    def docdata
      return @docdata if @docdata

      element = xpath_first("docdata")
      @docdata = element ? Docdata.new(element) : nil
    end

    # Attributes of the pubdata element, keyed by symbol.
    #
    # @return [Hash] only attributes that are present; empty without pubdata
    def pubdata
      @pubdata ||= parse_pubdata
    end

    # One hash per revision-history element, in document order.
    #
    # @return [Array<Hash>] each hash holds only the attributes that are present
    def revision_history
      @revision_history ||= parse_revision_history
    end

    # Content attributes of every meta element named "keywords".
    #
    # @return [Array<String>] meta elements lacking a content attribute are skipped
    def keywords
      @keywords ||= begin
        contents = xpath_match("meta[@name='keywords']").map do |meta_element|
          meta_element.attributes["content"]
        end
        contents.compact
      end
    end

    # Every named meta element as a name => content mapping.
    #
    # @return [Hash<String, String>] meta elements without a name are skipped;
    #   a later duplicate name overwrites an earlier one
    def meta
      @meta ||= begin
        pairs = {}
        xpath_match("meta").each do |meta_element|
          key = meta_element.attributes["name"]
          next unless key

          pairs[key] = meta_element.attributes["content"]
        end
        pairs
      end
    end

    private

    def xpath_first(path)
      REXML::XPath.first(node, path)
    end

    def xpath_match(path)
      REXML::XPath.match(node, path)
    end

    # Build the pubdata hash by mapping result keys to XML attribute names.
    def parse_pubdata
      pubdata_node = xpath_first("pubdata")
      return {} unless pubdata_node

      attribute_names = {
        type: "type",
        date_publication: "date.publication",
        name: "name",
        issn: "issn",
        volume: "volume",
        number: "number",
        edition: "edition.name",
        position_section: "position.section",
        position_sequence: "position.sequence"
      }
      found = attribute_names.transform_values { |xml_name| pubdata_node.attributes[xml_name] }
      found.compact
    end

    # Build one hash per revision-history element.
    def parse_revision_history
      attribute_names = { comment: "comment", name: "name", function: "function", norm: "norm" }

      xpath_match("revision-history").map do |revision|
        entry = attribute_names.transform_values { |xml_name| revision.attributes[xml_name] }
        entry.compact
      end
    end
  end
end
@@ -0,0 +1,58 @@
1
+ # frozen_string_literal: true
2
+
3
module NITFr
  # Represents headline information from an NITF document
  #
  # Reads the primary (hl1) and secondary (hl2) headline elements found
  # directly beneath the wrapped node.
  class Headline
    attr_reader :node

    # @param node [REXML::Element] the element whose hl1/hl2 children are read
    def initialize(node)
      @node = node
    end

    # Get the primary headline (hl1)
    #
    # @return [String, nil] the main headline text, including text nested in
    #   inline markup; nil when hl1 is absent or contains no text
    def primary
      @primary ||= element_text("hl1")
    end
    alias hl1 primary

    # Get the secondary headline (hl2)
    #
    # Only the first hl2 element is considered.
    #
    # @return [String, nil] the secondary headline text, including text nested
    #   in inline markup; nil when hl2 is absent or contains no text
    def secondary
      @secondary ||= element_text("hl2")
    end
    alias hl2 secondary

    # Get all headline levels as an array
    #
    # @return [Array<String>] primary then secondary, skipping missing levels
    def all
      @all ||= [primary, secondary].compact
    end

    # Get the full headline text (all levels joined)
    #
    # @return [String] headline levels joined with " - "
    def to_s
      all.join(" - ")
    end

    # Check if headline exists
    #
    # @return [Boolean] true if any headline text exists
    def present?
      !primary.nil? || !secondary.nil?
    end

    private

    def xpath_first(path)
      REXML::XPath.first(node, path)
    end

    # Stripped text of the first element matching +path+.
    #
    # REXML::Element#text yields only the first direct text node, which
    # truncates headlines containing inline elements. All descendant text is
    # concatenated instead; the result is unchanged for plain-text elements.
    #
    # @return [String, nil] nil when the element is missing or has no text nodes
    def element_text(path)
      element = xpath_first(path)
      return nil unless element

      fragments = text_fragments(element)
      fragments.empty? ? nil : fragments.join.strip
    end

    # Collect the values of all text nodes under +element+ in document order.
    # Comments and other non-text, non-element nodes are ignored.
    def text_fragments(element)
      element.children.flat_map do |child|
        case child
        when REXML::Text then [child.value]
        when REXML::Element then text_fragments(child)
        else []
        end
      end
    end
  end
end
@@ -0,0 +1,139 @@
1
+ # frozen_string_literal: true
2
+
3
module NITFr
  # Represents a media element from an NITF document
  #
  # Media elements can represent images, audio, video, or other
  # multimedia content embedded in the article.
  class Media
    attr_reader :node

    # @param node [REXML::Element] the media element to read from
    def initialize(node)
      @node = node
    end

    # Get the media type (image, audio, video, etc.)
    #
    # @return [String, nil] the media-type attribute
    def type
      node.attributes["media-type"]
    end

    # Check if this is an image
    #
    # @return [Boolean] true if media type is image
    def image?
      type == "image"
    end

    # Check if this is audio
    #
    # @return [Boolean] true if media type is audio
    def audio?
      type == "audio"
    end

    # Check if this is video
    #
    # @return [Boolean] true if media type is video
    def video?
      type == "video"
    end

    # Get the caption text
    #
    # @return [String, nil] the caption, including text nested inside child
    #   elements; nil when media-caption is absent or has no text
    def caption
      @caption ||= element_text("media-caption")
    end

    # Get the producer/credit information
    #
    # @return [String, nil] the producer/credit, including text nested inside
    #   child elements; nil when media-producer is absent or has no text
    def producer
      @producer ||= element_text("media-producer")
    end
    alias credit producer

    # Get all media references (different formats/sizes)
    #
    # Width and height are converted with String#to_i; attributes that are
    # absent are omitted from each hash.
    #
    # @return [Array<Hash>] array of reference info, in document order
    def references
      @references ||= xpath_match("media-reference").map do |ref|
        {
          source: ref.attributes["source"],
          mime_type: ref.attributes["mime-type"],
          coding: ref.attributes["coding"],
          width: ref.attributes["width"]&.to_i,
          height: ref.attributes["height"]&.to_i,
          alternate_text: ref.attributes["alternate-text"],
          name: ref.attributes["name"]
        }.compact
      end
    end

    # Get the primary/first reference
    #
    # @return [Hash, nil] the first reference
    def primary_reference
      references.first
    end

    # Get the source URL of the primary reference
    #
    # @return [String, nil] the source URL
    def source
      primary_reference&.dig(:source)
    end
    alias src source
    alias url source

    # Get the mime type of the primary reference
    #
    # @return [String, nil] the mime type
    def mime_type
      primary_reference&.dig(:mime_type)
    end

    # Get the alternate text
    #
    # @return [String, nil] the alt text of the primary reference
    def alt_text
      primary_reference&.dig(:alternate_text)
    end

    # Get the width
    #
    # @return [Integer, nil] width of the primary reference
    def width
      primary_reference&.dig(:width)
    end

    # Get the height
    #
    # @return [Integer, nil] height of the primary reference
    def height
      primary_reference&.dig(:height)
    end

    # Get media metadata
    #
    # @return [Hash] the id and class attributes that are present
    def metadata
      @metadata ||= {
        id: node.attributes["id"],
        class: node.attributes["class"]
      }.compact
    end

    private

    def xpath_first(path)
      REXML::XPath.first(node, path)
    end

    def xpath_match(path)
      REXML::XPath.match(node, path)
    end

    # Stripped text of the first element matching +path+.
    #
    # REXML::Element#text yields only the first direct text node, so a
    # caption whose text sits inside child elements would come back
    # blank or truncated. All descendant text is concatenated instead; the
    # result is unchanged for plain-text elements.
    #
    # @return [String, nil] nil when the element is missing or has no text nodes
    def element_text(path)
      element = xpath_first(path)
      return nil unless element

      fragments = text_fragments(element)
      fragments.empty? ? nil : fragments.join.strip
    end

    # Collect the values of all text nodes under +element+ in document order.
    # Comments and other non-text, non-element nodes are ignored.
    def text_fragments(element)
      element.children.flat_map do |child|
        case child
        when REXML::Text then [child.value]
        when REXML::Element then text_fragments(child)
        else []
        end
      end
    end
  end
end