nitfr 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +232 -0
- data/lib/nitfr/body.rb +42 -0
- data/lib/nitfr/byline.rb +13 -0
- data/lib/nitfr/docdata.rb +22 -0
- data/lib/nitfr/document.rb +274 -0
- data/lib/nitfr/exporter.rb +257 -0
- data/lib/nitfr/footnote.rb +63 -0
- data/lib/nitfr/head.rb +14 -0
- data/lib/nitfr/headline.rb +40 -3
- data/lib/nitfr/media.rb +18 -0
- data/lib/nitfr/paragraph.rb +130 -1
- data/lib/nitfr/search_pattern.rb +29 -0
- data/lib/nitfr/text_extractor.rb +11 -1
- data/lib/nitfr/version.rb +1 -1
- data/lib/nitfr.rb +3 -0
- metadata +5 -1
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module NITFr
|
|
4
|
+
# Provides export functionality for NITF documents
|
|
5
|
+
#
|
|
6
|
+
# Supports conversion to Markdown, plain text, and HTML formats.
|
|
7
|
+
module Exporter
|
|
8
|
+
# Convert document to Markdown format
|
|
9
|
+
#
|
|
10
|
+
# @return [String] Markdown representation of the document
|
|
11
|
+
def to_markdown
  # Every section is emitted as its content line followed by a blank
  # separator line; the trailing separator is trimmed by the final strip.
  out = []

  out.push("# #{headline}", "") if headline
  out.push("*#{byline.text}*", "") if byline&.text
  out.push("**#{body.dateline}**", "") if body&.dateline
  out.push("> #{body.abstract}", "") if body&.abstract

  # Body paragraphs, with inline emphasis/strong markers applied
  paragraphs.each { |para| out.push(format_paragraph_markdown(para), "") }

  # Block quotes are rendered after the paragraphs
  body&.block_quotes&.each { |quote| out.push("> #{quote}", "") }

  # Footnotes sit under a horizontal rule; unlabeled ones use "*"
  if footnotes.any?
    out.push("---", "")
    footnotes.each do |fn|
      marker = fn.label || "*"
      out << "[#{marker}]: #{fn.value}"
    end
    out << ""
  end

  out.join("\n").strip
end
|
|
63
|
+
|
|
64
|
+
# Convert document to plain text format
|
|
65
|
+
#
|
|
66
|
+
# @return [String] plain text representation of the document
|
|
67
|
+
def to_text
  # Sections are appended as "content, blank line" pairs; the final
  # strip removes the trailing blank line.
  out = []

  # Upper-cased headline underlined with "=" to the headline's length
  out.push(headline.upcase, "=" * headline.length, "") if headline

  out.push(byline.text, "") if byline&.text
  out.push(body.dateline, "") if body&.dateline

  paragraphs.each { |para| out.push(para.text, "") }

  # Block quotes are wrapped in double quotes
  body&.block_quotes&.each { |quote| out.push(" \"#{quote}\"", "") }

  # Footnotes follow a 40-character dashed rule; unlabeled ones use "*"
  if footnotes.any?
    out.push("-" * 40, "")
    footnotes.each do |fn|
      marker = fn.label || "*"
      out << "[#{marker}] #{fn.value}"
    end
    out << ""
  end

  out.join("\n").strip
end
|
|
114
|
+
|
|
115
|
+
# Convert document to HTML format
|
|
116
|
+
#
|
|
117
|
+
# @param include_wrapper [Boolean] whether to include html/body tags (default: false)
|
|
118
|
+
# @return [String] HTML representation of the document
|
|
119
|
+
def to_html(include_wrapper: false)
  # The article always opens with a <header>, even when it ends up empty.
  markup = ["<article>", " <header>"]
  markup << " <h1>#{escape_html(headline)}</h1>" if headline
  markup << " <p class=\"byline\">#{escape_html(byline.text)}</p>" if byline&.text
  markup << " <p class=\"dateline\">#{escape_html(body.dateline)}</p>" if body&.dateline
  markup << " </header>"

  # Optional abstract, rendered as an aside
  if body&.abstract
    markup.push(
      " <aside class=\"abstract\">",
      " <p>#{escape_html(body.abstract)}</p>",
      " </aside>"
    )
  end

  # Main content: paragraphs first, then block quotes
  markup << " <section class=\"content\">"
  paragraphs.each { |para| markup << format_paragraph_html(para) }
  body&.block_quotes&.each do |quote|
    markup.push(" <blockquote>", " <p>#{escape_html(quote)}</p>", " </blockquote>")
  end
  markup << " </section>"

  # Footnotes become an ordered list; the id attribute is emitted only
  # when the footnote has one.
  if footnotes.any?
    markup.push(" <footer class=\"footnotes\">", " <ol>")
    footnotes.each do |fn|
      id_attr = fn.id ? " id=\"#{escape_html(fn.id)}\"" : ""
      markup << " <li#{id_attr}>#{escape_html(fn.value)}</li>"
    end
    markup.push(" </ol>", " </footer>")
  end

  markup << "</article>"
  article = markup.join("\n")

  return wrap_html(article) if include_wrapper

  article
end
|
|
187
|
+
|
|
188
|
+
private
|
|
189
|
+
|
|
190
|
+
# Render one paragraph as Markdown, wrapping every occurrence of each
# emphasized fragment in *...* and each strong fragment in **...**.
#
# @param para [#text, #emphasis, #strong] paragraph-like object
# @return [String] the paragraph text with Markdown markers inserted
def format_paragraph_markdown(para)
  rendered = para.text

  # The block form of gsub is used deliberately: with a replacement
  # *string*, backslash sequences such as "\0" or "\1" that occur inside
  # the fragment would be interpreted as back-references and corrupt the
  # output. A block's return value is inserted verbatim.
  para.emphasis.each do |fragment|
    rendered = rendered.gsub(fragment) { "*#{fragment}*" }
  end

  para.strong.each do |fragment|
    rendered = rendered.gsub(fragment) { "**#{fragment}**" }
  end

  rendered
end
|
|
205
|
+
|
|
206
|
+
# Render one paragraph as an HTML <p> element.
#
# The text is escaped first, line breaks become <br>, and every occurrence
# of each (escaped) emphasized / strong fragment is wrapped in <em> /
# <strong>. Lead paragraphs receive class="lead".
#
# @param para [#text, #emphasis, #strong, #lead?] paragraph-like object
# @return [String] the <p> element markup
def format_paragraph_html(para)
  rendered = escape_html(para.text)

  # Convert line breaks to <br>
  rendered = rendered.gsub("\n", "<br>\n")

  # The block form of gsub keeps backslash sequences in the fragment
  # (e.g. "\0", "\1") from being treated as back-references, which the
  # string-replacement form would do.
  para.emphasis.each do |fragment|
    escaped = escape_html(fragment)
    rendered = rendered.gsub(escaped) { "<em>#{escaped}</em>" }
  end

  para.strong.each do |fragment|
    escaped = escape_html(fragment)
    rendered = rendered.gsub(escaped) { "<strong>#{escaped}</strong>" }
  end

  classes = []
  classes << "lead" if para.lead?

  class_attr = classes.any? ? " class=\"#{classes.join(' ')}\"" : ""
  " <p#{class_attr}>#{rendered}</p>"
end
|
|
230
|
+
|
|
231
|
+
# Escape the characters that are significant in HTML text and
# double-quoted attribute values.
#
# @param text [#to_s, nil] value to escape
# @return [String] escaped text; "" when text is nil
def escape_html(text)
  return "" if text.nil?

  # "&" must be replaced first so the ampersands introduced by the
  # later replacements are not escaped a second time.
  text.to_s
      .gsub("&", "&amp;")
      .gsub("<", "&lt;")
      .gsub(">", "&gt;")
      .gsub('"', "&quot;")
end
|
|
240
|
+
|
|
241
|
+
# Wrap an HTML fragment in a minimal standalone HTML5 page.
#
# The page title falls back from the document title to the headline and
# finally to a fixed placeholder.
#
# @param content [String] markup to place inside <body>
# @return [String] the full HTML page
def wrap_html(content)
  page_title = escape_html(title || headline || 'NITF Document')

  <<~HTML
    <!DOCTYPE html>
    <html lang="en">
    <head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>#{page_title}</title>
    </head>
    <body>
    #{content}
    </body>
    </html>
  HTML
end
|
|
256
|
+
end
|
|
257
|
+
end
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module NITFr
|
|
4
|
+
# Represents a footnote from an NITF document
|
|
5
|
+
#
|
|
6
|
+
# Footnotes can appear in body.content or body.end and contain
|
|
7
|
+
# a label (reference marker) and value (the footnote text).
|
|
8
|
+
class Footnote
  attr_reader :node

  # @param node [REXML::Element] the footnote element to wrap
  def initialize(node)
    @node = node
  end

  # Footnote ID, read from the element's "id" attribute.
  #
  # @return [String, nil] the attribute value
  def id
    attrs = node.attributes
    attrs["id"]
  end

  # Reference marker taken from the fn-label child (e.g. "1", "*", "a").
  #
  # @return [String, nil] the stripped label text
  def label
    @label ||= xpath_text("fn-label")
  end

  # Footnote content taken from the fn-value child.
  #
  # @return [String, nil] the stripped footnote text
  def value
    @value ||= xpath_text("fn-value")
  end
  alias_method :text, :value
  alias_method :content, :value

  # Whether the footnote carries any content.
  #
  # @return [Boolean] true when value is neither nil nor empty
  def present?
    !(value.nil? || value.empty?)
  end

  # Hash form of the footnote; nil entries are left out.
  #
  # @return [Hash] id/label/value pairs that are set
  def to_h
    fields = { id: id, label: label, value: value }
    fields.reject { |_key, val| val.nil? }
  end

  private

  # Stripped text of the first element matching path, or nil when there
  # is no match or the match has no text.
  def xpath_text(path)
    match = REXML::XPath.first(node, path)
    return nil if match.nil?

    raw = match.text
    raw&.strip
  end
end
|
|
63
|
+
end
|
data/lib/nitfr/head.rb
CHANGED
|
@@ -60,6 +60,20 @@ module NITFr
|
|
|
60
60
|
end
|
|
61
61
|
end
|
|
62
62
|
|
|
63
|
+
# Convert head to a Hash representation
|
|
64
|
+
#
|
|
65
|
+
# @return [Hash] the head as a hash
|
|
66
|
+
def to_h
  # Empty collections are treated like missing values and dropped,
  # together with any nil entry, by the final compact.
  unless_empty = ->(collection) { collection.empty? ? nil : collection }

  summary = {
    title: title,
    meta: unless_empty.call(meta),
    keywords: unless_empty.call(keywords),
    pubdata: unless_empty.call(pubdata),
    revision_history: unless_empty.call(revision_history),
    docdata: docdata&.to_h
  }
  summary.compact
end
|
|
76
|
+
|
|
63
77
|
private
|
|
64
78
|
|
|
65
79
|
def xpath_first(path)
|
data/lib/nitfr/headline.rb
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
module NITFr
|
|
4
4
|
# Represents headline information from an NITF document
|
|
5
5
|
#
|
|
6
|
-
# NITF supports multiple headline levels (hl1
|
|
6
|
+
# NITF supports multiple headline levels (hl1 through hl5) as well as
|
|
7
7
|
# headline (alternate headline) elements.
|
|
8
8
|
class Headline
|
|
9
9
|
attr_reader :node
|
|
@@ -28,11 +28,35 @@ module NITFr
|
|
|
28
28
|
end
|
|
29
29
|
alias hl2 secondary
|
|
30
30
|
|
|
31
|
+
# Get the tertiary headline (hl3)
|
|
32
|
+
#
|
|
33
|
+
# @return [String, nil] the tertiary headline text
|
|
34
|
+
def tertiary
  # Memoized; a nil result is looked up again on the next call.
  return @tertiary if @tertiary

  hl3_node = xpath_first("hl3")
  @tertiary = hl3_node&.text&.strip
end
|
|
37
|
+
alias hl3 tertiary
|
|
38
|
+
|
|
39
|
+
# Get the quaternary headline (hl4)
|
|
40
|
+
#
|
|
41
|
+
# @return [String, nil] the quaternary headline text
|
|
42
|
+
def quaternary
  # Memoized; a nil result is looked up again on the next call.
  return @quaternary if @quaternary

  hl4_node = xpath_first("hl4")
  @quaternary = hl4_node&.text&.strip
end
|
|
45
|
+
alias hl4 quaternary
|
|
46
|
+
|
|
47
|
+
# Get the quinary headline (hl5)
|
|
48
|
+
#
|
|
49
|
+
# @return [String, nil] the quinary headline text
|
|
50
|
+
def quinary
  # Memoized; a nil result is looked up again on the next call.
  return @quinary if @quinary

  hl5_node = xpath_first("hl5")
  @quinary = hl5_node&.text&.strip
end
|
|
53
|
+
alias hl5 quinary
|
|
54
|
+
|
|
31
55
|
# Get all headline levels as an array
|
|
32
56
|
#
|
|
33
57
|
# @return [Array<String>] array of headline texts in order
|
|
34
58
|
def all
|
|
35
|
-
@all ||= [primary, secondary].compact
|
|
59
|
+
@all ||= [primary, secondary, tertiary, quaternary, quinary].compact
|
|
36
60
|
end
|
|
37
61
|
|
|
38
62
|
# Get the full headline text (all levels joined)
|
|
@@ -46,7 +70,20 @@ module NITFr
|
|
|
46
70
|
#
|
|
47
71
|
# @return [Boolean] true if any headline text exists
|
|
48
72
|
def present?
|
|
49
|
-
|
|
73
|
+
all.any?
|
|
74
|
+
end
|
|
75
|
+
|
|
76
|
+
# Convert headline to a Hash representation
|
|
77
|
+
#
|
|
78
|
+
# @return [Hash] the headline as a hash
|
|
79
|
+
def to_h
  # One entry per headline level, in level order; levels without text
  # are omitted.
  %i[primary secondary tertiary quaternary quinary].each_with_object({}) do |level, levels|
    headline_text = send(level)
    levels[level] = headline_text unless headline_text.nil?
  end
end
|
|
51
88
|
|
|
52
89
|
private
|
data/lib/nitfr/media.rb
CHANGED
|
@@ -126,6 +126,24 @@ module NITFr
|
|
|
126
126
|
}.compact
|
|
127
127
|
end
|
|
128
128
|
|
|
129
|
+
# Convert media to a Hash representation
|
|
130
|
+
#
|
|
131
|
+
# @return [Hash] the media as a hash
|
|
132
|
+
def to_h
  # metadata is included only when non-empty; references only when
  # there is more than one. nil entries are dropped by compact.
  extra_metadata = metadata.empty? ? nil : metadata
  multiple_references = references.size > 1 ? references : nil

  summary = {
    type: type,
    source: source,
    mime_type: mime_type,
    width: width,
    height: height,
    alt_text: alt_text,
    caption: caption,
    credit: credit,
    metadata: extra_metadata,
    references: multiple_references
  }
  summary.compact
end
|
|
146
|
+
|
|
129
147
|
private
|
|
130
148
|
|
|
131
149
|
def xpath_first(path)
|
data/lib/nitfr/paragraph.rb
CHANGED
|
@@ -11,6 +11,7 @@ module NITFr
|
|
|
11
11
|
# arrays on first access to any entity method.
|
|
12
12
|
class Paragraph
|
|
13
13
|
include TextExtractor
|
|
14
|
+
include SearchPattern
|
|
14
15
|
|
|
15
16
|
attr_reader :node
|
|
16
17
|
|
|
@@ -48,7 +49,7 @@ module NITFr
|
|
|
48
49
|
lede == "true" || lede == "yes"
|
|
49
50
|
end
|
|
50
51
|
|
|
51
|
-
# Get any emphasized text within the paragraph
|
|
52
|
+
# Get any emphasized text within the paragraph (em tags)
|
|
52
53
|
#
|
|
53
54
|
# @return [Array<String>] array of emphasized text
|
|
54
55
|
def emphasis
|
|
@@ -56,6 +57,14 @@ module NITFr
|
|
|
56
57
|
@emphasis
|
|
57
58
|
end
|
|
58
59
|
|
|
60
|
+
# Get any strong/bold text within the paragraph (strong tags)
|
|
61
|
+
#
|
|
62
|
+
# @return [Array<String>] array of strong text
|
|
63
|
+
def strong
  # Entities are collected on first access; later calls reuse the
  # stored array.
  extract_entities if !@entities_extracted

  @strong
end
|
|
67
|
+
|
|
59
68
|
# Get any links within the paragraph
|
|
60
69
|
#
|
|
61
70
|
# @return [Array<Hash>] array of link info hashes
|
|
@@ -111,6 +120,107 @@ module NITFr
|
|
|
111
120
|
text.split(/\s+/).size
|
|
112
121
|
end
|
|
113
122
|
|
|
123
|
+
# =========================================================================
|
|
124
|
+
# Search Helper Methods
|
|
125
|
+
# =========================================================================
|
|
126
|
+
|
|
127
|
+
# Check if paragraph contains the given text
|
|
128
|
+
#
|
|
129
|
+
# @param query [String, Regexp] the search query
|
|
130
|
+
# @param case_sensitive [Boolean] whether search is case-sensitive (default: false)
|
|
131
|
+
# @return [Boolean] true if text is found
|
|
132
|
+
def contains?(query, case_sensitive: false)
  # The query (String or Regexp) is normalized to a Regexp first and
  # then tested against the paragraph text.
  matcher = build_search_pattern(query, case_sensitive)
  haystack = text
  haystack.match?(matcher)
end
|
|
136
|
+
|
|
137
|
+
# Check if paragraph mentions a specific person
|
|
138
|
+
#
|
|
139
|
+
# @param name [String] the person name to search for
|
|
140
|
+
# @param exact [Boolean] if true, requires exact match (default: false)
|
|
141
|
+
# @return [Boolean] true if person is mentioned
|
|
142
|
+
def mentions_person?(name, exact: false)
  # Checked against the people extracted from this paragraph.
  known_people = people
  entity_match?(known_people, name, exact)
end
|
|
145
|
+
|
|
146
|
+
# Check if paragraph mentions a specific organization
|
|
147
|
+
#
|
|
148
|
+
# @param name [String] the organization name to search for
|
|
149
|
+
# @param exact [Boolean] if true, requires exact match (default: false)
|
|
150
|
+
# @return [Boolean] true if organization is mentioned
|
|
151
|
+
def mentions_org?(name, exact: false)
  # Checked against the organizations extracted from this paragraph.
  known_orgs = organizations
  entity_match?(known_orgs, name, exact)
end
|
|
154
|
+
|
|
155
|
+
# Check if paragraph mentions a specific location
|
|
156
|
+
#
|
|
157
|
+
# @param name [String] the location name to search for
|
|
158
|
+
# @param exact [Boolean] if true, requires exact match (default: false)
|
|
159
|
+
# @return [Boolean] true if location is mentioned
|
|
160
|
+
def mentions_location?(name, exact: false)
  # Checked against the locations extracted from this paragraph.
  known_locations = locations
  entity_match?(known_locations, name, exact)
end
|
|
163
|
+
|
|
164
|
+
# Check if paragraph mentions any of the given entities
|
|
165
|
+
#
|
|
166
|
+
# @param person [String, nil] person name to check
|
|
167
|
+
# @param org [String, nil] organization name to check
|
|
168
|
+
# @param location [String, nil] location name to check
|
|
169
|
+
# @return [Boolean] true if any specified entity is mentioned
|
|
170
|
+
def mentions?(person: nil, org: nil, location: nil)
  # Each supplied criterion is checked in turn (person, org, location);
  # the first hit wins. Explicit true/false returns keep the documented
  # Boolean contract: a bare `a || b || c` chain would yield nil when the
  # last operand is an unset (nil) keyword.
  return true if person && mentions_person?(person)
  return true if org && mentions_org?(org)
  return true if location && mentions_location?(location)

  false
end
|
|
177
|
+
|
|
178
|
+
# Check if paragraph has any links
|
|
179
|
+
#
|
|
180
|
+
# @return [Boolean] true if paragraph contains links
|
|
181
|
+
def has_links?
  # True as soon as the extracted link list has a usable entry.
  !links.none?
end
|
|
184
|
+
|
|
185
|
+
# Check if paragraph has any emphasis
|
|
186
|
+
#
|
|
187
|
+
# @return [Boolean] true if paragraph contains emphasized text
|
|
188
|
+
def has_emphasis?
  # True as soon as at least one emphasized fragment was extracted.
  !emphasis.none?
end
|
|
191
|
+
|
|
192
|
+
# Check if paragraph has any strong/bold text
|
|
193
|
+
#
|
|
194
|
+
# @return [Boolean] true if paragraph contains strong text
|
|
195
|
+
def has_strong?
  # True as soon as at least one strong fragment was extracted.
  !strong.none?
end
|
|
198
|
+
|
|
199
|
+
# Check if paragraph mentions any entities
|
|
200
|
+
#
|
|
201
|
+
# @return [Boolean] true if paragraph contains any person, org, or location references
|
|
202
|
+
def has_entities?
  # Equivalent to "any people, organizations or locations"; the lists
  # are consulted in that order and checking stops at the first
  # non-empty one.
  !(people.none? && organizations.none? && locations.none?)
end
|
|
205
|
+
|
|
206
|
+
# Convert paragraph to a Hash representation
|
|
207
|
+
#
|
|
208
|
+
# @return [Hash] the paragraph as a hash
|
|
209
|
+
def to_h
  # Empty collections and a false lead flag are mapped to nil so that
  # the final compact removes them from the result.
  unless_empty = ->(collection) { collection.empty? ? nil : collection }

  summary = {
    id: id,
    text: text,
    lead: lead? || nil,
    word_count: word_count,
    people: unless_empty.call(people),
    organizations: unless_empty.call(organizations),
    locations: unless_empty.call(locations),
    emphasis: unless_empty.call(emphasis),
    strong: unless_empty.call(strong),
    links: unless_empty.call(links)
  }
  summary.compact
end
|
|
223
|
+
|
|
114
224
|
private
|
|
115
225
|
|
|
116
226
|
# Extract all entities in a single DOM traversal
|
|
@@ -122,6 +232,7 @@ module NITFr
|
|
|
122
232
|
@organizations = []
|
|
123
233
|
@locations = []
|
|
124
234
|
@emphasis = []
|
|
235
|
+
@strong = []
|
|
125
236
|
@links = []
|
|
126
237
|
|
|
127
238
|
traverse_for_entities(node)
|
|
@@ -147,6 +258,9 @@ module NITFr
|
|
|
147
258
|
when "em"
|
|
148
259
|
text = child.text&.strip
|
|
149
260
|
@emphasis << text if text && !text.empty?
|
|
261
|
+
when "strong"
|
|
262
|
+
text = child.text&.strip
|
|
263
|
+
@strong << text if text && !text.empty?
|
|
150
264
|
when "a"
|
|
151
265
|
@links << {
|
|
152
266
|
text: child.text&.strip,
|
|
@@ -158,5 +272,20 @@ module NITFr
|
|
|
158
272
|
traverse_for_entities(child)
|
|
159
273
|
end
|
|
160
274
|
end
|
|
275
|
+
|
|
276
|
+
# Check if any entity matches the given name
|
|
277
|
+
#
|
|
278
|
+
# @param entities [Array<String>] array of entity names
|
|
279
|
+
# @param name [String] name to search for
|
|
280
|
+
# @param exact [Boolean] require exact match
|
|
281
|
+
# @return [Boolean] true if match found
|
|
282
|
+
def entity_match?(entities, name, exact)
  # Exact mode: plain equality against each entity.
  return entities.include?(name) if exact

  # Otherwise: case-insensitive substring match, with the name treated
  # as literal text rather than as a pattern.
  matcher = Regexp.new(Regexp.escape(name), Regexp::IGNORECASE)
  entities.any? { |candidate| candidate.match?(matcher) }
end
|
|
161
290
|
end
|
|
162
291
|
end
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module NITFr
|
|
4
|
+
# Shared module for building search patterns from queries
|
|
5
|
+
#
|
|
6
|
+
# Provides consistent pattern building across Document and Paragraph
|
|
7
|
+
# search methods, with proper escaping and case sensitivity handling.
|
|
8
|
+
module SearchPattern
|
|
9
|
+
private
|
|
10
|
+
|
|
11
|
+
# Build a regex pattern from query
|
|
12
|
+
#
|
|
13
|
+
# @param query [String, Regexp] the search query
|
|
14
|
+
# @param case_sensitive [Boolean] whether search is case-sensitive
|
|
15
|
+
# @return [Regexp] compiled pattern
|
|
16
|
+
def build_search_pattern(query, case_sensitive)
  # Non-Regexp queries are matched literally: the string form is
  # escaped before being compiled.
  unless query.is_a?(Regexp)
    flags = case_sensitive ? nil : Regexp::IGNORECASE
    return Regexp.new(Regexp.escape(query.to_s), flags)
  end

  # A Regexp is returned untouched for case-sensitive searches.
  return query if case_sensitive

  # Otherwise rebuild it with its own flags plus case insensitivity.
  Regexp.new(query.source, query.options | Regexp::IGNORECASE)
end
|
|
28
|
+
end
|
|
29
|
+
end
|
data/lib/nitfr/text_extractor.rb
CHANGED
|
@@ -6,9 +6,14 @@ module NITFr
|
|
|
6
6
|
# REXML's built-in text method only returns direct text content,
|
|
7
7
|
# not text from nested elements. This module provides a method
|
|
8
8
|
# to recursively extract all text content.
|
|
9
|
+
#
|
|
10
|
+
# Preserves hard line breaks (<br/>) as newline characters.
|
|
9
11
|
module TextExtractor
|
|
10
12
|
# Extract all text content from an element and its descendants
|
|
11
13
|
#
|
|
14
|
+
# Converts <br/> elements to newline characters to preserve
|
|
15
|
+
# intended line breaks within content.
|
|
16
|
+
#
|
|
12
17
|
# @param element [REXML::Element] the element to extract text from
|
|
13
18
|
# @return [String] the concatenated text content
|
|
14
19
|
def extract_all_text(element)
|
|
@@ -17,7 +22,12 @@ module NITFr
|
|
|
17
22
|
if child.is_a?(REXML::Text)
|
|
18
23
|
result << child.value
|
|
19
24
|
elsif child.is_a?(REXML::Element)
|
|
20
|
-
|
|
25
|
+
# Convert <br/> to newline
|
|
26
|
+
if child.name == "br"
|
|
27
|
+
result << "\n"
|
|
28
|
+
else
|
|
29
|
+
result << extract_all_text(child)
|
|
30
|
+
end
|
|
21
31
|
end
|
|
22
32
|
end
|
|
23
33
|
result
|
data/lib/nitfr/version.rb
CHANGED
data/lib/nitfr.rb
CHANGED
|
@@ -13,6 +13,8 @@ end
|
|
|
13
13
|
require_relative "nitfr/version"
|
|
14
14
|
require_relative "nitfr/errors"
|
|
15
15
|
require_relative "nitfr/text_extractor"
|
|
16
|
+
require_relative "nitfr/search_pattern"
|
|
17
|
+
require_relative "nitfr/exporter"
|
|
16
18
|
require_relative "nitfr/document"
|
|
17
19
|
require_relative "nitfr/head"
|
|
18
20
|
require_relative "nitfr/body"
|
|
@@ -20,6 +22,7 @@ require_relative "nitfr/headline"
|
|
|
20
22
|
require_relative "nitfr/byline"
|
|
21
23
|
require_relative "nitfr/paragraph"
|
|
22
24
|
require_relative "nitfr/media"
|
|
25
|
+
require_relative "nitfr/footnote"
|
|
23
26
|
require_relative "nitfr/docdata"
|
|
24
27
|
|
|
25
28
|
module NITFr
|