nitfr 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,257 @@
1
+ # frozen_string_literal: true
2
+
3
module NITFr
  # Provides export functionality for NITF documents.
  #
  # Supports conversion to Markdown, plain text, and HTML formats. The module
  # is a mixin: the including object must respond to +headline+, +byline+,
  # +body+, +paragraphs+, +footnotes+ and (for the HTML wrapper) +title+.
  # Paragraph objects must respond to +text+, +emphasis+, +strong+ and
  # +lead?+; footnote objects to +id+, +label+ and +value+.
  module Exporter
    # Characters replaced by #escape_html, with their entity equivalents.
    HTML_ESCAPES = {
      "&" => "&amp;",
      "<" => "&lt;",
      ">" => "&gt;",
      '"' => "&quot;",
      "'" => "&#39;"
    }.freeze
    private_constant :HTML_ESCAPES

    # Convert document to Markdown format
    #
    # @return [String] Markdown representation of the document
    def to_markdown
      lines = []

      # Title/Headline
      lines.push("# #{headline}", "") if headline

      # Byline
      lines.push("*#{byline.text}*", "") if byline&.text

      # Dateline
      lines.push("**#{body.dateline}**", "") if body&.dateline

      # Abstract
      lines.push(markdown_quote(body.abstract), "") if body&.abstract

      # Paragraphs
      paragraphs.each do |para|
        lines.push(format_paragraph_markdown(para), "")
      end

      # Block quotes
      body&.block_quotes&.each do |quote|
        lines.push(markdown_quote(quote), "")
      end

      # Footnotes
      if footnotes.any?
        lines.push("---", "")
        footnotes.each do |fn|
          label = fn.label || "*"
          lines << "[#{label}]: #{fn.value}"
        end
        lines << ""
      end

      lines.join("\n").strip
    end

    # Convert document to plain text format
    #
    # @return [String] plain text representation of the document
    def to_text
      lines = []

      # Title/Headline
      if headline
        banner = headline.upcase
        # Size the underline from the upcased text: upcasing can change the
        # length (e.g. "ß" becomes "SS").
        lines.push(banner, "=" * banner.length, "")
      end

      # Byline
      lines.push(byline.text, "") if byline&.text

      # Dateline
      lines.push(body.dateline, "") if body&.dateline

      # Paragraphs
      paragraphs.each do |para|
        lines.push(para.text, "")
      end

      # Block quotes
      body&.block_quotes&.each do |quote|
        lines.push(" \"#{quote}\"", "")
      end

      # Footnotes
      if footnotes.any?
        lines.push("-" * 40, "")
        footnotes.each do |fn|
          label = fn.label || "*"
          lines << "[#{label}] #{fn.value}"
        end
        lines << ""
      end

      lines.join("\n").strip
    end

    # Convert document to HTML format
    #
    # @param include_wrapper [Boolean] whether to include html/body tags (default: false)
    # @return [String] HTML representation of the document
    def to_html(include_wrapper: false)
      html_parts = []

      # Article container
      html_parts << "<article>"

      # Header section
      html_parts << " <header>"
      html_parts << " <h1>#{escape_html(headline)}</h1>" if headline
      html_parts << " <p class=\"byline\">#{escape_html(byline.text)}</p>" if byline&.text
      html_parts << " <p class=\"dateline\">#{escape_html(body.dateline)}</p>" if body&.dateline
      html_parts << " </header>"

      # Abstract
      if body&.abstract
        html_parts << " <aside class=\"abstract\">"
        html_parts << " <p>#{escape_html(body.abstract)}</p>"
        html_parts << " </aside>"
      end

      # Main content
      html_parts << " <section class=\"content\">"

      paragraphs.each do |para|
        html_parts << format_paragraph_html(para)
      end

      # Block quotes
      body&.block_quotes&.each do |quote|
        html_parts << " <blockquote>"
        html_parts << " <p>#{escape_html(quote)}</p>"
        html_parts << " </blockquote>"
      end

      html_parts << " </section>"

      # Footnotes
      if footnotes.any?
        html_parts << " <footer class=\"footnotes\">"
        html_parts << " <ol>"
        footnotes.each do |fn|
          id_attr = fn.id ? " id=\"#{escape_html(fn.id)}\"" : ""
          html_parts << " <li#{id_attr}>#{escape_html(fn.value)}</li>"
        end
        html_parts << " </ol>"
        html_parts << " </footer>"
      end

      html_parts << "</article>"

      content = html_parts.join("\n")
      include_wrapper ? wrap_html(content) : content
    end

    private

    # Render text as a Markdown block quote, prefixing every line with "> "
    # so that multi-line text stays inside the quote.
    #
    # @param text [#to_s] the quote text
    # @return [String] the quoted Markdown
    def markdown_quote(text)
      "> #{text.to_s.gsub("\n", "\n> ")}"
    end

    # Render one paragraph as Markdown, wrapping emphasized phrases in
    # single asterisks and strong phrases in double asterisks.
    #
    # @param para [#text, #emphasis, #strong] the paragraph
    # @return [String] the Markdown text
    def format_paragraph_markdown(para)
      inline_segments(para).map do |segment, emphasized, strong|
        segment = "**#{segment}**" if strong
        segment = "*#{segment}*" if emphasized
        segment
      end.join
    end

    # Render one paragraph as an HTML <p> element.
    #
    # Escaping and line-break conversion are done per segment, before any
    # tag is added, so phrase matching can never alter generated markup or
    # character entities.
    #
    # @param para [#text, #emphasis, #strong, #lead?] the paragraph
    # @return [String] the HTML element
    def format_paragraph_html(para)
      inner = inline_segments(para).map do |segment, emphasized, strong|
        html = escape_html(segment).gsub("\n", "<br>\n")
        html = "<strong>#{html}</strong>" if strong
        html = "<em>#{html}</em>" if emphasized
        html
      end.join

      class_attr = para.lead? ? " class=\"lead\"" : ""
      " <p#{class_attr}>#{inner}</p>"
    end

    # Split a paragraph's text into plain and marked-up segments in a single
    # pass over the raw text.
    #
    # Every occurrence of an emphasis/strong phrase becomes its own segment.
    # Longer phrases are tried first, so a phrase that contains another one
    # is matched as a whole.
    #
    # @param para [#text, #emphasis, #strong] the paragraph
    # @return [Array<Array(String, Boolean, Boolean)>] triples of
    #   segment text, emphasized flag, strong flag
    def inline_segments(para)
      text = para.text.to_s
      emphasized = Array(para.emphasis)
      strong = Array(para.strong)

      phrases = (emphasized | strong).compact.reject(&:empty?).sort_by { |phrase| -phrase.length }
      return [[text, false, false]] if phrases.empty?

      # Regexp.union escapes each phrase; the capture group makes String#split
      # return the matched phrases at the odd indexes of the result.
      matcher = Regexp.new("(#{Regexp.union(phrases).source})")
      text.split(matcher).each_with_index.map do |segment, index|
        marked = index.odd?
        [segment, marked && emphasized.include?(segment), marked && strong.include?(segment)]
      end
    end

    # Escape the characters that are special in HTML text and attributes.
    #
    # @param text [#to_s, nil] the raw text
    # @return [String] the escaped text ("" for nil)
    def escape_html(text)
      text.to_s.gsub(/[&<>"']/, HTML_ESCAPES)
    end

    # Wrap an HTML fragment in a minimal standalone HTML5 page. The page title
    # falls back from +title+ to +headline+ to a fixed default.
    #
    # @param content [String] the HTML fragment
    # @return [String] the complete HTML page
    def wrap_html(content)
      <<~HTML
        <!DOCTYPE html>
        <html lang="en">
        <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>#{escape_html(title || headline || 'NITF Document')}</title>
        </head>
        <body>
        #{content}
        </body>
        </html>
      HTML
    end
  end
end
@@ -0,0 +1,63 @@
1
+ # frozen_string_literal: true
2
+
3
module NITFr
  # Represents a footnote from an NITF document
  #
  # Footnotes can appear in body.content or body.end and contain
  # a label (reference marker) and value (the footnote text).
  class Footnote
    attr_reader :node

    # @param node [REXML::Element] the footnote element to wrap
    def initialize(node)
      @node = node
    end

    # Get the footnote ID
    #
    # @return [String, nil] the footnote ID attribute
    def id
      node.attributes["id"]
    end

    # Get the footnote label (reference marker)
    #
    # The result is cached after the first lookup, including a nil result,
    # so a footnote without a label is only searched once.
    #
    # @return [String, nil] the label text (e.g., "1", "*", "a")
    def label
      return @label if defined?(@label)

      @label = xpath_text("fn-label")
    end

    # Get the footnote value (content)
    #
    # The result is cached after the first lookup, including a nil result.
    #
    # @return [String, nil] the footnote text content
    def value
      return @value if defined?(@value)

      @value = xpath_text("fn-value")
    end
    alias text value
    alias content value

    # Check if footnote has content
    #
    # @return [Boolean] true if footnote has a non-empty value
    def present?
      !value.nil? && !value.empty?
    end

    # Convert footnote to a Hash representation
    #
    # Keys whose value is nil are omitted.
    #
    # @return [Hash] the footnote as a hash
    def to_h
      {
        id: id,
        label: label,
        value: value
      }.compact
    end

    private

    # Find the first element matching +path+ relative to the footnote node
    # and return its stripped text.
    #
    # @param path [String] XPath expression
    # @return [String, nil] the stripped text, or nil when nothing matches
    def xpath_text(path)
      element = REXML::XPath.first(node, path)
      element&.text&.strip
    end
  end
end
data/lib/nitfr/head.rb CHANGED
@@ -60,6 +60,20 @@ module NITFr
60
60
  end
61
61
  end
62
62
 
63
+ # Convert head to a Hash representation
64
+ #
65
+ # @return [Hash] the head as a hash
66
+ def to_h # Builds a symbol-keyed Hash of the head's fields; keys with nil values are removed by #compact
67
+ {
68
+ title: title,
69
+ meta: meta.empty? ? nil : meta, # an empty collection becomes nil so that #compact drops the key
70
+ keywords: keywords.empty? ? nil : keywords,
71
+ pubdata: pubdata.empty? ? nil : pubdata,
72
+ revision_history: revision_history.empty? ? nil : revision_history,
73
+ docdata: docdata&.to_h # safe navigation: a nil docdata stays nil and is dropped as well
74
+ }.compact
75
+ end
76
+
63
77
  private
64
78
 
65
79
  def xpath_first(path)
@@ -3,7 +3,7 @@
3
3
  module NITFr
4
4
  # Represents headline information from an NITF document
5
5
  #
6
- # NITF supports multiple headline levels (hl1, hl2) as well as
6
+ # NITF supports multiple headline levels (hl1 through hl5) as well as
7
7
  # headline (alternate headline) elements.
8
8
  class Headline
9
9
  attr_reader :node
@@ -28,11 +28,35 @@ module NITFr
28
28
  end
29
29
  alias hl2 secondary
30
30
 
31
+ # Get the tertiary headline (hl3)
32
+ #
33
+ # @return [String, nil] the tertiary headline text
34
+ def tertiary # Stripped text of the first "hl3" match, or nil when there is no match or no text
35
+ @tertiary ||= xpath_first("hl3")&.text&.strip # cached in @tertiary; ||= repeats the lookup while the cached value is nil
36
+ end
37
+ alias hl3 tertiary
38
+
39
+ # Get the quaternary headline (hl4)
40
+ #
41
+ # @return [String, nil] the quaternary headline text
42
+ def quaternary # Stripped text of the first "hl4" match, or nil when there is no match or no text
43
+ @quaternary ||= xpath_first("hl4")&.text&.strip # cached in @quaternary; ||= repeats the lookup while the cached value is nil
44
+ end
45
+ alias hl4 quaternary
46
+
47
+ # Get the quinary headline (hl5)
48
+ #
49
+ # @return [String, nil] the quinary headline text
50
+ def quinary # Stripped text of the first "hl5" match, or nil when there is no match or no text
51
+ @quinary ||= xpath_first("hl5")&.text&.strip # cached in @quinary; ||= repeats the lookup while the cached value is nil
52
+ end
53
+ alias hl5 quinary
54
+
31
55
  # Get all headline levels as an array
32
56
  #
33
57
  # @return [Array<String>] array of headline texts in order
34
58
  def all
35
- @all ||= [primary, secondary].compact
59
+ @all ||= [primary, secondary, tertiary, quaternary, quinary].compact
36
60
  end
37
61
 
38
62
  # Get the full headline text (all levels joined)
@@ -46,7 +70,20 @@ module NITFr
46
70
  #
47
71
  # @return [Boolean] true if any headline text exists
48
72
  def present?
49
- !primary.nil? || !secondary.nil?
73
+ all.any?
74
+ end
75
+
76
+ # Convert headline to a Hash representation
77
+ #
78
+ # @return [Hash] the headline as a hash
79
+ def to_h
80
+ {
81
+ primary: primary,
82
+ secondary: secondary,
83
+ tertiary: tertiary,
84
+ quaternary: quaternary,
85
+ quinary: quinary
86
+ }.compact
50
87
  end
51
88
 
52
89
  private
data/lib/nitfr/media.rb CHANGED
@@ -126,6 +126,24 @@ module NITFr
126
126
  }.compact
127
127
  end
128
128
 
129
+ # Convert media to a Hash representation
130
+ #
131
+ # @return [Hash] the media as a hash
132
+ def to_h # Builds a symbol-keyed Hash of the media fields; keys with nil values are removed by #compact
133
+ {
134
+ type: type,
135
+ source: source,
136
+ mime_type: mime_type,
137
+ width: width,
138
+ height: height,
139
+ alt_text: alt_text,
140
+ caption: caption,
141
+ credit: credit,
142
+ metadata: metadata.empty? ? nil : metadata, # empty metadata becomes nil so that #compact drops the key
143
+ references: references.size > 1 ? references : nil # NOTE(review): emitted only when there are 2+ references; presumably a single reference is already covered by the scalar fields above - confirm
144
+ }.compact
145
+ end
146
+
129
147
  private
130
148
 
131
149
  def xpath_first(path)
@@ -11,6 +11,7 @@ module NITFr
11
11
  # arrays on first access to any entity method.
12
12
  class Paragraph
13
13
  include TextExtractor
14
+ include SearchPattern
14
15
 
15
16
  attr_reader :node
16
17
 
@@ -48,7 +49,7 @@ module NITFr
48
49
  lede == "true" || lede == "yes"
49
50
  end
50
51
 
51
- # Get any emphasized text within the paragraph
52
+ # Get any emphasized text within the paragraph (em tags)
52
53
  #
53
54
  # @return [Array<String>] array of emphasized text
54
55
  def emphasis
@@ -56,6 +57,14 @@ module NITFr
56
57
  @emphasis
57
58
  end
58
59
 
60
+ # Get any strong/bold text within the paragraph (strong tags)
61
+ #
62
+ # @return [Array<String>] array of strong text
63
+ def strong # Returns the @strong array collected from "strong" child elements during entity extraction
64
+ extract_entities unless @entities_extracted # run the extraction only if @entities_extracted is not set yet
65
+ @strong
66
+ end
67
+
59
68
  # Get any links within the paragraph
60
69
  #
61
70
  # @return [Array<Hash>] array of link info hashes
@@ -111,6 +120,107 @@ module NITFr
111
120
  text.split(/\s+/).size
112
121
  end
113
122
 
123
+ # =========================================================================
124
+ # Search Helper Methods
125
+ # =========================================================================
126
+
127
+ # Check if paragraph contains the given text
128
+ #
129
+ # @param query [String, Regexp] the search query
130
+ # @param case_sensitive [Boolean] whether search is case-sensitive (default: false)
131
+ # @return [Boolean] true if text is found
132
+ def contains?(query, case_sensitive: false) # Tests the paragraph text against a pattern built from query
133
+ pattern = build_search_pattern(query, case_sensitive) # String queries are escaped by the helper, so they match literally
134
+ text.match?(pattern)
135
+ end
136
+
137
+ # Check if paragraph mentions a specific person
138
+ #
139
+ # @param name [String] the person name to search for
140
+ # @param exact [Boolean] if true, requires exact match (default: false)
141
+ # @return [Boolean] true if person is mentioned
142
+ def mentions_person?(name, exact: false) # Matches name against the paragraph's people list
143
+ entity_match?(people, name, exact) # exact => string equality; otherwise a case-insensitive substring match
144
+ end
145
+
146
+ # Check if paragraph mentions a specific organization
147
+ #
148
+ # @param name [String] the organization name to search for
149
+ # @param exact [Boolean] if true, requires exact match (default: false)
150
+ # @return [Boolean] true if organization is mentioned
151
+ def mentions_org?(name, exact: false) # Matches name against the paragraph's organizations list
152
+ entity_match?(organizations, name, exact) # exact => string equality; otherwise a case-insensitive substring match
153
+ end
154
+
155
+ # Check if paragraph mentions a specific location
156
+ #
157
+ # @param name [String] the location name to search for
158
+ # @param exact [Boolean] if true, requires exact match (default: false)
159
+ # @return [Boolean] true if location is mentioned
160
+ def mentions_location?(name, exact: false) # Matches name against the paragraph's locations list
161
+ entity_match?(locations, name, exact) # exact => string equality; otherwise a case-insensitive substring match
162
+ end
163
+
164
+ # Check if paragraph mentions any of the given entities
165
+ #
166
+ # @param person [String, nil] person name to check
167
+ # @param org [String, nil] organization name to check
168
+ # @param location [String, nil] location name to check
169
+ # @return [Boolean] true if any specified entity is mentioned
170
def mentions?(person: nil, org: nil, location: nil)
  # Only the keywords that were supplied are checked, each with the default
  # (non-exact) matching of its mentions_* helper. Explicit returns keep the
  # result a strict Boolean: chaining the checks with || would yield nil
  # whenever the last evaluated keyword was not supplied.
  return true if person && mentions_person?(person)
  return true if org && mentions_org?(org)
  return true if location && mentions_location?(location)

  false
end
177
+
178
+ # Check if paragraph has any links
179
+ #
180
+ # @return [Boolean] true if paragraph contains links
181
+ def has_links? # True when the links array is non-empty
182
+ links.any?
183
+ end
184
+
185
+ # Check if paragraph has any emphasis
186
+ #
187
+ # @return [Boolean] true if paragraph contains emphasized text
188
+ def has_emphasis? # True when at least one emphasized phrase was collected
189
+ emphasis.any?
190
+ end
191
+
192
+ # Check if paragraph has any strong/bold text
193
+ #
194
+ # @return [Boolean] true if paragraph contains strong text
195
+ def has_strong? # True when at least one strong phrase was collected
196
+ strong.any?
197
+ end
198
+
199
+ # Check if paragraph mentions any entities
200
+ #
201
+ # @return [Boolean] true if paragraph contains any person, org, or location references
202
+ def has_entities? # True when any of the three entity lists is non-empty
203
+ people.any? || organizations.any? || locations.any? # short-circuits on the first non-empty list
204
+ end
205
+
206
+ # Convert paragraph to a Hash representation
207
+ #
208
+ # @return [Hash] the paragraph as a hash
209
+ def to_h # Builds a symbol-keyed Hash of the paragraph; keys with nil values are removed by #compact
210
+ {
211
+ id: id,
212
+ text: text,
213
+ lead: lead? || nil, # a false result becomes nil, so the key only appears for lead paragraphs
214
+ word_count: word_count,
215
+ people: people.empty? ? nil : people, # empty lists become nil so that #compact drops the key
216
+ organizations: organizations.empty? ? nil : organizations,
217
+ locations: locations.empty? ? nil : locations,
218
+ emphasis: emphasis.empty? ? nil : emphasis,
219
+ strong: strong.empty? ? nil : strong,
220
+ links: links.empty? ? nil : links
221
+ }.compact
222
+ end
223
+
114
224
  private
115
225
 
116
226
  # Extract all entities in a single DOM traversal
@@ -122,6 +232,7 @@ module NITFr
122
232
  @organizations = []
123
233
  @locations = []
124
234
  @emphasis = []
235
+ @strong = []
125
236
  @links = []
126
237
 
127
238
  traverse_for_entities(node)
@@ -147,6 +258,9 @@ module NITFr
147
258
  when "em"
148
259
  text = child.text&.strip
149
260
  @emphasis << text if text && !text.empty?
261
+ when "strong"
262
+ text = child.text&.strip
263
+ @strong << text if text && !text.empty?
150
264
  when "a"
151
265
  @links << {
152
266
  text: child.text&.strip,
@@ -158,5 +272,20 @@ module NITFr
158
272
  traverse_for_entities(child)
159
273
  end
160
274
  end
275
+
276
+ # Check if any entity matches the given name
277
+ #
278
+ # @param entities [Array<String>] array of entity names
279
+ # @param name [String] name to search for
280
+ # @param exact [Boolean] require exact match
281
+ # @return [Boolean] true if match found
282
+ def entity_match?(entities, name, exact) # Tests whether any entry of entities matches name
283
+ if exact
284
+ entities.any? { |e| e == name } # exact: plain == comparison, so the match is case-sensitive
285
+ else
286
+ pattern = /#{Regexp.escape(name)}/i # name is escaped, so it is matched literally and case-insensitively
287
+ entities.any? { |e| e.match?(pattern) } # unanchored pattern: a substring match is enough
288
+ end
289
+ end
161
290
  end
162
291
  end
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
+ module NITFr
4
+ # Shared module for building search patterns from queries
5
+ #
6
+ # Provides consistent pattern building across Document and Paragraph
7
+ # search methods, with proper escaping and case sensitivity handling.
8
+ module SearchPattern
9
+ private
10
+
11
+ # Build a Regexp from a String or Regexp query
12
+ #
13
+ # @param query [String, Regexp] the search query; anything that is not a Regexp is converted with to_s and escaped, so it matches literally
14
+ # @param case_sensitive [Boolean] when falsy, Regexp::IGNORECASE is added to the resulting pattern
15
+ # @return [Regexp] the query itself (Regexp given and case_sensitive truthy) or a newly built Regexp
16
+ def build_search_pattern(query, case_sensitive)
17
+ if query.is_a?(Regexp)
18
+ if case_sensitive
19
+ query # returned unchanged, so flags already set on the Regexp (including /i) still apply
20
+ else
21
+ # Preserve original flags while adding case insensitivity
22
+ Regexp.new(query.source, query.options | Regexp::IGNORECASE)
23
+ end
24
+ else
25
+ Regexp.new(Regexp.escape(query.to_s), case_sensitive ? nil : Regexp::IGNORECASE) # nil options => no flags, i.e. a case-sensitive literal match
26
+ end
27
+ end
28
+ end
29
+ end
@@ -6,9 +6,14 @@ module NITFr
6
6
  # REXML's built-in text method only returns direct text content,
7
7
  # not text from nested elements. This module provides a method
8
8
  # to recursively extract all text content.
9
+ #
10
+ # Preserves hard line breaks (<br/>) as newline characters.
9
11
  module TextExtractor
10
12
  # Extract all text content from an element and its descendants
11
13
  #
14
+ # Converts <br/> elements to newline characters to preserve
15
+ # intended line breaks within content.
16
+ #
12
17
  # @param element [REXML::Element] the element to extract text from
13
18
  # @return [String] the concatenated text content
14
19
  def extract_all_text(element)
@@ -17,7 +22,12 @@ module NITFr
17
22
  if child.is_a?(REXML::Text)
18
23
  result << child.value
19
24
  elsif child.is_a?(REXML::Element)
20
- result << extract_all_text(child)
25
+ # Convert <br/> to newline
26
+ if child.name == "br"
27
+ result << "\n"
28
+ else
29
+ result << extract_all_text(child)
30
+ end
21
31
  end
22
32
  end
23
33
  result
data/lib/nitfr/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module NITFr
4
- VERSION = "1.0.0"
4
+ VERSION = "1.1.0"
5
5
  end
data/lib/nitfr.rb CHANGED
@@ -13,6 +13,8 @@ end
13
13
  require_relative "nitfr/version"
14
14
  require_relative "nitfr/errors"
15
15
  require_relative "nitfr/text_extractor"
16
+ require_relative "nitfr/search_pattern"
17
+ require_relative "nitfr/exporter"
16
18
  require_relative "nitfr/document"
17
19
  require_relative "nitfr/head"
18
20
  require_relative "nitfr/body"
@@ -20,6 +22,7 @@ require_relative "nitfr/headline"
20
22
  require_relative "nitfr/byline"
21
23
  require_relative "nitfr/paragraph"
22
24
  require_relative "nitfr/media"
25
+ require_relative "nitfr/footnote"
23
26
  require_relative "nitfr/docdata"
24
27
 
25
28
  module NITFr