nitfr 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +330 -0
- data/Rakefile +12 -0
- data/lib/nitfr/body.rb +191 -0
- data/lib/nitfr/byline.rb +66 -0
- data/lib/nitfr/docdata.rb +201 -0
- data/lib/nitfr/document.rb +173 -0
- data/lib/nitfr/errors.rb +12 -0
- data/lib/nitfr/head.rb +101 -0
- data/lib/nitfr/headline.rb +58 -0
- data/lib/nitfr/media.rb +139 -0
- data/lib/nitfr/paragraph.rb +162 -0
- data/lib/nitfr/text_extractor.rb +26 -0
- data/lib/nitfr/version.rb +5 -0
- data/lib/nitfr.rb +48 -0
- metadata +101 -0
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "date"
|
|
4
|
+
|
|
5
|
+
module NITFr
|
|
6
|
+
# Represents the docdata section of an NITF document head.
#
# Docdata contains document metadata including IDs, dates,
# urgency, and other management information. Every reader evaluates an
# XPath expression against the wrapped node and memoizes truthy results.
class Docdata
  attr_reader :node

  # @param node [REXML::Element] the element XPath queries are evaluated against
  def initialize(node)
    @node = node
  end

  # Get the document ID
  #
  # @return [String, nil] the id-string attribute of <doc-id>
  def doc_id
    @doc_id ||= attribute_of("doc-id", "id-string")
  end

  # Get the issue date
  #
  # @return [Date, nil] the parsed issue date, nil when absent or unparseable
  def issue_date
    @issue_date ||= parse_date("date.issue")
  end

  # Get the release date
  #
  # @return [Date, nil] the parsed release date, nil when absent or unparseable
  def release_date
    @release_date ||= parse_date("date.release")
  end

  # Get the expiration date
  #
  # @return [Date, nil] the parsed expire date, nil when absent or unparseable
  def expire_date
    @expire_date ||= parse_date("date.expire")
  end

  # Get urgency level (1-8, 1 being most urgent)
  #
  # The ed-urg attribute is converted with String#to_i, so a non-numeric
  # value yields 0.
  #
  # @return [Integer, nil] the urgency value
  def urgency
    @urgency ||= attribute_of("urgency", "ed-urg")&.to_i
  end

  # Get the document copyright information
  #
  # @return [Hash] copyright details (:holder, :year); missing attributes are omitted
  def copyright
    @copyright ||= parse_copyright
  end

  # Get the copyright holder
  #
  # @return [String, nil] the copyright holder
  def copyright_holder
    copyright[:holder]
  end

  # Get the copyright year
  #
  # @return [String, nil] the copyright year
  def copyright_year
    copyright[:year]
  end

  # Get document scope information
  #
  # @return [String, nil] the scope attribute of <doc-scope>
  def doc_scope
    @doc_scope ||= attribute_of("doc-scope", "scope")
  end

  # Get series information
  #
  # @return [Hash] series details (:name, :part, :total); missing attributes are omitted
  def series
    @series ||= parse_series
  end

  # Get editorial status/management info
  #
  # @return [Hash] management status (:info, :message_type); missing attributes are omitted
  def management_status
    @management_status ||= parse_management_status
  end

  # Get the fixture identifier
  #
  # @return [String, nil] the fix-id attribute of <fixture>
  def fixture
    @fixture ||= attribute_of("fixture", "fix-id")
  end

  # Get all identified content (subjects, organizations, people, etc.)
  #
  # @return [Hash] categorized identified content; empty when the
  #   <identified-content> element is absent
  def identified_content
    @identified_content ||= parse_identified_content
  end

  # Get subject codes/topics
  #
  # @return [Array<String>] array of subjects
  def subjects
    identified_content[:subjects] || []
  end

  # Get location codes
  #
  # @return [Array<String>] array of locations
  def locations
    identified_content[:locations] || []
  end

  # Get organization codes
  #
  # @return [Array<String>] array of organizations
  def organizations
    identified_content[:organizations] || []
  end

  # Get person codes
  #
  # @return [Array<String>] array of people
  def people
    identified_content[:people] || []
  end

  private

  def xpath_first(path)
    REXML::XPath.first(node, path)
  end

  def xpath_match(path)
    REXML::XPath.match(node, path)
  end

  # Read one attribute from the first element matching +path+.
  #
  # @return [String, nil] nil when the element or the attribute is missing
  def attribute_of(path, attribute_name)
    element = xpath_first(path)
    element && element.attributes[attribute_name]
  end

  # Parse the norm attribute of the named date element.
  #
  # @param element_name [String] child element name, e.g. "date.issue"
  # @return [Date, nil] nil when the element or attribute is missing, or
  #   when the value cannot be parsed as a date
  def parse_date(element_name)
    norm = attribute_of(element_name, "norm")
    return nil unless norm

    begin
      Date.parse(norm)
    rescue ArgumentError
      # Date::Error (malformed date) is a subclass of ArgumentError, and
      # Date.parse raises a plain ArgumentError when the input exceeds its
      # length limit; both mean "no usable date" here.
      nil
    end
  end

  def parse_copyright
    copyright_node = xpath_first("doc.copyright")
    return {} unless copyright_node

    {
      holder: copyright_node.attributes["holder"],
      year: copyright_node.attributes["year"]
    }.compact
  end

  def parse_series
    series_node = xpath_first("series")
    return {} unless series_node

    {
      name: series_node.attributes["series.name"],
      part: series_node.attributes["series.part"]&.to_i,
      total: series_node.attributes["series.totalpart"]&.to_i
    }.compact
  end

  def parse_management_status
    status_node = xpath_first("ed-msg")
    return {} unless status_node

    {
      info: status_node.attributes["info"],
      message_type: status_node.attributes["msg-type"]
    }.compact
  end

  def parse_identified_content
    id_node = xpath_first("identified-content")
    return {} unless id_node

    {
      subjects: stripped_texts(id_node, "classifier[@type='subject']"),
      locations: stripped_texts(id_node, "location"),
      organizations: stripped_texts(id_node, "org"),
      people: stripped_texts(id_node, "person")
    }
  end

  # Collect the stripped text of every element matching +path+ under
  # +parent+, skipping elements that have no text node.
  def stripped_texts(parent, path)
    REXML::XPath.match(parent, path).map { |element| element.text&.strip }.compact
  end
end
|
|
201
|
+
end
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module NITFr
|
|
4
|
+
# Represents a parsed NITF document
#
# The Document class is the main entry point for working with NITF content.
# It provides access to all parts of the NITF structure including head and body.
#
# @note This parser does not process external entities (DTD references) for security.
#   REXML by default does not expand external entities, which protects against XXE attacks.
class Document
  attr_reader :xml_doc, :head, :body

  # Create a new Document from an NITF XML string
  #
  # @param xml [String] the NITF XML content
  # @raise [ParseError] if the XML is malformed
  # @raise [InvalidDocumentError] if the document is not valid NITF
  def initialize(xml)
    @xml_doc = parse_xml(xml)
    validate_nitf!
    parse_content
  end

  # Get the document's title from the head section
  #
  # @return [String, nil] the document title
  def title
    head&.title
  end

  # Get the main headline text
  #
  # @return [String, nil] the primary headline
  def headline
    body&.headline&.primary
  end

  # Get all headline levels
  #
  # @return [Headline, nil] the headline object with all levels
  def headlines
    body&.headline
  end

  # Get the byline information
  #
  # @return [Byline, nil] the byline object
  def byline
    body&.byline
  end

  # Get all paragraphs from the body content
  #
  # @return [Array<Paragraph>] array of paragraph objects (empty without a body)
  def paragraphs
    body&.paragraphs || []
  end

  # Get the full text content of the article
  #
  # @return [String] paragraph texts joined by blank lines
  def text
    @text ||= paragraphs.map(&:text).join("\n\n")
  end

  # Get all media objects (images, etc.) from the document
  #
  # @return [Array<Media>] array of media objects (empty without a body)
  def media
    body&.media || []
  end

  # Get document metadata from docdata
  #
  # @return [Docdata, nil] the docdata object
  def docdata
    head&.docdata
  end

  # Get the document ID
  #
  # @return [String, nil] the document ID
  def doc_id
    docdata&.doc_id
  end

  # Get the issue date
  #
  # @return [Date, nil] the issue date
  def issue_date
    docdata&.issue_date
  end

  # Get the NITF version from the root element
  #
  # @return [String, nil] the NITF version
  def version
    nitf_root.attributes["version"]
  end

  # Get the change date from the root element
  #
  # @return [String, nil] the change date
  def change_date
    nitf_root.attributes["change.date"]
  end

  # Get the change time from the root element
  #
  # @return [String, nil] the change time
  def change_time
    nitf_root.attributes["change.time"]
  end

  # Check if this is a valid NITF document
  #
  # @return [Boolean] true if an <nitf> element was located
  def valid?
    !nitf_root.nil?
  end

  # Return the document as an XML string
  #
  # @return [String] the parsed document serialized back to XML by REXML
  def to_xml
    @xml_doc.to_s
  end

  private

  # Parse XML string into REXML document
  #
  # REXML does not expand external entities by default, which protects against:
  # - XXE (XML External Entity) attacks
  # - Billion Laughs (entity expansion) attacks
  #
  # Security settings are configured at module load time in lib/nitfr.rb
  #
  # @param xml [String] the XML content
  # @return [REXML::Document] the parsed document
  # @raise [ParseError] wrapping any REXML::ParseException
  def parse_xml(xml)
    REXML::Document.new(xml)
  rescue REXML::ParseException => e
    raise ParseError, "Failed to parse XML: #{e.message}"
  end

  # @raise [InvalidDocumentError] when no <nitf> element can be found
  def validate_nitf!
    return if nitf_root

    raise InvalidDocumentError, "Document does not appear to be valid NITF (missing <nitf> root element)"
  end

  # Locate the <nitf> element and memoize it.
  #
  # The begin block must evaluate to its result rather than `return` it:
  # an explicit return would leave the method before the ||= assignment
  # runs, so the fast path would never be cached.
  #
  # @return [REXML::Element, nil] the nitf element, or nil when absent
  def nitf_root
    @nitf_root ||= begin
      root = @xml_doc.root
      if root&.name == "nitf"
        # Direct root access avoids an XPath search in the common case.
        root
      else
        # Fall back to XPath search for nested nitf elements
        REXML::XPath.first(@xml_doc, "//nitf")
      end
    end
  end

  # Build the Head and Body wrappers for whichever sections are present.
  def parse_content
    head_node = REXML::XPath.first(nitf_root, "head")
    body_node = REXML::XPath.first(nitf_root, "body")

    @head = Head.new(head_node) if head_node
    @body = Body.new(body_node) if body_node
  end
end
|
|
173
|
+
end
|
data/lib/nitfr/errors.rb
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module NITFr
|
|
4
|
+
# Root of the NITFr exception hierarchy; rescue this to catch any
# error raised by the library.
class Error < StandardError
end

# Signals that the XML input could not be parsed.
class ParseError < Error
end

# Signals that the parsed XML is not a valid NITF document.
class InvalidDocumentError < Error
end
|
|
12
|
+
end
|
data/lib/nitfr/head.rb
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module NITFr
|
|
4
|
+
# Wraps the head section of an NITF document.
#
# The head carries document-level metadata: the title, the docdata
# block, publication data, revision history and meta tags.
class Head
  # Hash key => <pubdata> attribute name, in output order.
  PUBDATA_ATTRIBUTES = {
    type: "type",
    date_publication: "date.publication",
    name: "name",
    issn: "issn",
    volume: "volume",
    number: "number",
    edition: "edition.name",
    position_section: "position.section",
    position_sequence: "position.sequence"
  }.freeze
  private_constant :PUBDATA_ATTRIBUTES

  # Attribute names copied from each <revision-history> element.
  REVISION_ATTRIBUTES = %w[comment name function norm].freeze
  private_constant :REVISION_ATTRIBUTES

  attr_reader :node

  # @param node [REXML::Element] the element XPath queries are evaluated against
  def initialize(node)
    @node = node
  end

  # Document title, stripped of surrounding whitespace.
  #
  # @return [String, nil] the title text
  def title
    @title ||= begin
      title_node = xpath_first("title")
      title_node&.text&.strip
    end
  end

  # Document metadata wrapper.
  #
  # @return [Docdata, nil] the docdata object, nil when there is no <docdata> child
  def docdata
    @docdata ||= begin
      found = xpath_first("docdata")
      found ? Docdata.new(found) : nil
    end
  end

  # Publication data.
  #
  # @return [Hash] publication metadata; attributes that are not set are omitted
  def pubdata
    @pubdata ||= parse_pubdata
  end

  # Revision history.
  #
  # @return [Array<Hash>] one hash per <revision-history> element
  def revision_history
    @revision_history ||= parse_revision_history
  end

  # Keywords taken from <meta name="keywords"> elements.
  #
  # @return [Array<String>] the content attribute of each matching meta tag
  def keywords
    @keywords ||= begin
      contents = xpath_match("meta[@name='keywords']").map do |meta_node|
        meta_node.attributes["content"]
      end
      contents.compact
    end
  end

  # All named meta tags as a hash.
  #
  # @return [Hash<String, String>] meta name => content pairs; tags without
  #   a name attribute are skipped and a repeated name keeps the last content
  def meta
    @meta ||= begin
      named = xpath_match("meta").select { |meta_node| meta_node.attributes["name"] }
      named.map { |meta_node| [meta_node.attributes["name"], meta_node.attributes["content"]] }.to_h
    end
  end

  private

  def xpath_first(path)
    REXML::XPath.first(node, path)
  end

  def xpath_match(path)
    REXML::XPath.match(node, path)
  end

  # Read the <pubdata> attributes into a symbol-keyed hash.
  def parse_pubdata
    pubdata_node = xpath_first("pubdata")
    return {} if pubdata_node.nil?

    PUBDATA_ATTRIBUTES.transform_values { |attribute| pubdata_node.attributes[attribute] }.compact
  end

  # Read every <revision-history> element into a symbol-keyed hash.
  def parse_revision_history
    xpath_match("revision-history").map do |revision|
      entry = {}
      REVISION_ATTRIBUTES.each do |attribute|
        value = revision.attributes[attribute]
        entry[attribute.to_sym] = value unless value.nil?
      end
      entry
    end
  end
end
|
|
101
|
+
end
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module NITFr
|
|
4
|
+
# Represents headline information from an NITF document
#
# Exposes the two headline levels read from the wrapped node: the
# primary headline (hl1) and the secondary headline (hl2).
class Headline
  attr_reader :node

  # @param node [REXML::Element] the element containing the hl1/hl2 children
  def initialize(node)
    @node = node
  end

  # Get the primary headline (hl1)
  #
  # @return [String, nil] the main headline text, stripped; nil when there
  #   is no hl1 element or it has no text node
  def primary
    @primary ||= xpath_first("hl1")&.text&.strip
  end
  alias hl1 primary

  # Get the secondary headline (hl2)
  #
  # @return [String, nil] the secondary headline text, stripped; nil when
  #   there is no hl2 element or it has no text node
  def secondary
    @secondary ||= xpath_first("hl2")&.text&.strip
  end
  alias hl2 secondary

  # Get all headline levels as an array
  #
  # Levels that are missing or contain only whitespace are left out, so
  # the result holds only headlines that have actual text.
  #
  # @return [Array<String>] array of non-empty headline texts in order
  def all
    @all ||= [primary, secondary].compact.reject(&:empty?)
  end

  # Get the full headline text (all levels joined)
  #
  # @return [String] non-empty headline texts joined by " - "
  def to_s
    all.join(" - ")
  end

  # Check if headline exists
  #
  # @return [Boolean] true if any headline level has non-empty text
  def present?
    # A whitespace-only hl1/hl2 strips to "", which is not headline text.
    !all.empty?
  end

  private

  def xpath_first(path)
    REXML::XPath.first(node, path)
  end
end
|
|
58
|
+
end
|
data/lib/nitfr/media.rb
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module NITFr
|
|
4
|
+
# Wraps a single media element of an NITF document.
#
# A media element describes an image, audio clip, video or other
# multimedia item, together with its caption, producer and one or more
# references to the actual resource.
class Media
  attr_reader :node

  # @param node [REXML::Element] the media element
  def initialize(node)
    @node = node
  end

  # The value of the media-type attribute (image, audio, video, etc.).
  #
  # @return [String, nil] the media type
  def type
    node.attributes["media-type"]
  end

  # @return [Boolean] true if media type is image
  def image?
    of_type?("image")
  end

  # @return [Boolean] true if media type is audio
  def audio?
    of_type?("audio")
  end

  # @return [Boolean] true if media type is video
  def video?
    of_type?("video")
  end

  # Caption text, stripped of surrounding whitespace.
  #
  # @return [String, nil] the caption
  def caption
    @caption ||= stripped_text("media-caption")
  end

  # Producer/credit text, stripped of surrounding whitespace.
  #
  # @return [String, nil] the producer/credit
  def producer
    @producer ||= stripped_text("media-producer")
  end
  alias_method :credit, :producer

  # Every media-reference child (different formats/sizes).
  #
  # @return [Array<Hash>] one hash per reference; unset attributes are omitted
  def references
    @references ||= xpath_match("media-reference").map { |ref| describe_reference(ref) }
  end

  # The first media reference.
  #
  # @return [Hash, nil] the first reference, nil when there are none
  def primary_reference
    references.first
  end

  # Source URL of the primary reference.
  #
  # @return [String, nil] the source URL
  def source
    primary_value(:source)
  end
  alias_method :src, :source
  alias_method :url, :source

  # MIME type of the primary reference.
  #
  # @return [String, nil] the mime type
  def mime_type
    primary_value(:mime_type)
  end

  # Alternate text of the primary reference.
  #
  # @return [String, nil] the alt text
  def alt_text
    primary_value(:alternate_text)
  end

  # Width of the primary reference.
  #
  # @return [Integer, nil] the width attribute converted with to_i
  def width
    primary_value(:width)
  end

  # Height of the primary reference.
  #
  # @return [Integer, nil] the height attribute converted with to_i
  def height
    primary_value(:height)
  end

  # Additional attributes of the media element itself.
  #
  # @return [Hash] :id and :class when those attributes are set
  def metadata
    @metadata ||= begin
      attrs = node.attributes
      { id: attrs["id"], class: attrs["class"] }.compact
    end
  end

  private

  def xpath_first(path)
    REXML::XPath.first(node, path)
  end

  def xpath_match(path)
    REXML::XPath.match(node, path)
  end

  # Compare the media-type attribute against +expected+.
  def of_type?(expected)
    type == expected
  end

  # Stripped text of the first element matching +path+, if any.
  def stripped_text(path)
    element = xpath_first(path)
    element&.text&.strip
  end

  # Look up +key+ in the primary reference, tolerating its absence.
  def primary_value(key)
    reference = primary_reference
    reference&.dig(key)
  end

  # Turn one media-reference element into a symbol-keyed hash.
  def describe_reference(ref)
    attrs = ref.attributes
    {
      source: attrs["source"],
      mime_type: attrs["mime-type"],
      coding: attrs["coding"],
      width: attrs["width"]&.to_i,
      height: attrs["height"]&.to_i,
      alternate_text: attrs["alternate-text"],
      name: attrs["name"]
    }.compact
  end
end
|
|
139
|
+
end
|