nitfr 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +330 -0
- data/Rakefile +12 -0
- data/lib/nitfr/body.rb +191 -0
- data/lib/nitfr/byline.rb +66 -0
- data/lib/nitfr/docdata.rb +201 -0
- data/lib/nitfr/document.rb +173 -0
- data/lib/nitfr/errors.rb +12 -0
- data/lib/nitfr/head.rb +101 -0
- data/lib/nitfr/headline.rb +58 -0
- data/lib/nitfr/media.rb +139 -0
- data/lib/nitfr/paragraph.rb +162 -0
- data/lib/nitfr/text_extractor.rb +26 -0
- data/lib/nitfr/version.rb +5 -0
- data/lib/nitfr.rb +48 -0
- metadata +101 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 397b4e4f6de3e985e4d8bfbfe18ad663ab6bdd350536365a826728f87704c571
|
|
4
|
+
data.tar.gz: 775aab503543f0f678abbca1bb55e8420ec1ad829f7e2470a0f609dd2924798f
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 482f2856dd1c1e1854f2d6f463dcdcf46ed644b417934b46847bf96597c3b1cf6012a2a9001de5bdcdd0573e53c2cf34ab0177a74d3a645fd2d8a6d3a074aa35
|
|
7
|
+
data.tar.gz: 59ac46fea40da9ace71f9ce9663ac6c6275d5c268b589d4f3f583261e34d78fe74dc027105fb3a27d4389650a34b9585fd6df7369ebdbd012fb11c8c60a85b70
|
data/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2008-2025 Mark Turner
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,330 @@
|
|
|
1
|
+
# NITFr
|
|
2
|
+
|
|
3
|
+
[]()
|
|
4
|
+
[MIT License](LICENSE)
|
|
5
|
+
|
|
6
|
+
A Ruby gem for parsing NITF (News Industry Text Format) XML files.
|
|
7
|
+
|
|
8
|
+
NITF is a standard XML format developed by the IPTC (International Press Telecommunications Council) for marking up news articles. NITFr makes it easy for Ruby applications to parse and extract content from NITF documents.
|
|
9
|
+
|
|
10
|
+
## Requirements
|
|
11
|
+
|
|
12
|
+
- Ruby 3.0 or higher
|
|
13
|
+
- No native extensions or external dependencies (pure Ruby using REXML)
|
|
14
|
+
|
|
15
|
+
## Security
|
|
16
|
+
|
|
17
|
+
NITFr is designed with security in mind:
|
|
18
|
+
|
|
19
|
+
- **XXE Protection**: REXML does not expand external entities by default, protecting against XML External Entity (XXE) attacks
|
|
20
|
+
- **Entity Expansion Limits**: Configured to prevent "Billion Laughs" and similar entity expansion attacks
|
|
21
|
+
- **No Code Execution**: The parser never evaluates or executes content from XML documents
|
|
22
|
+
|
|
23
|
+
## Installation
|
|
24
|
+
|
|
25
|
+
Add this line to your application's Gemfile:
|
|
26
|
+
|
|
27
|
+
```ruby
|
|
28
|
+
gem 'nitfr'
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
And then execute:
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
bundle install
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
Or install it yourself:
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
gem install nitfr
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## Usage
|
|
44
|
+
|
|
45
|
+
### Basic Parsing
|
|
46
|
+
|
|
47
|
+
```ruby
|
|
48
|
+
require 'nitfr'
|
|
49
|
+
|
|
50
|
+
# Parse from a string
|
|
51
|
+
xml = File.read('article.xml')
|
|
52
|
+
doc = NITFr.parse(xml)
|
|
53
|
+
|
|
54
|
+
# Or parse directly from a file
|
|
55
|
+
doc = NITFr.parse_file('article.xml')
|
|
56
|
+
|
|
57
|
+
# With explicit encoding
|
|
58
|
+
doc = NITFr.parse_file('article.xml', encoding: 'ISO-8859-1')
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
### Accessing Content
|
|
62
|
+
|
|
63
|
+
```ruby
|
|
64
|
+
# Get the headline
|
|
65
|
+
doc.headline # => "Revolutionary Technology Changes Industry"
|
|
66
|
+
doc.headlines.primary # => "Revolutionary Technology Changes Industry"
|
|
67
|
+
doc.headlines.secondary # => "Experts predict widespread adoption"
|
|
68
|
+
|
|
69
|
+
# Get byline information
|
|
70
|
+
doc.byline.text # => "By Jane Smith, Senior Reporter"
|
|
71
|
+
doc.byline.person # => "Jane Smith"
|
|
72
|
+
doc.byline.title # => "Senior Reporter"
|
|
73
|
+
|
|
74
|
+
# Get the article text
|
|
75
|
+
doc.paragraphs.each do |para|
|
|
76
|
+
puts para.text
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
# Or get all text at once
|
|
80
|
+
puts doc.text
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
### Working with Metadata
|
|
84
|
+
|
|
85
|
+
```ruby
|
|
86
|
+
# Document metadata
|
|
87
|
+
doc.title # => "Sample News Article Title"
|
|
88
|
+
doc.doc_id # => "article-2024-001"
|
|
89
|
+
doc.issue_date # => #<Date: 2024-12-15>
|
|
90
|
+
|
|
91
|
+
# Copyright info
|
|
92
|
+
doc.docdata.copyright_holder # => "Example News Corp"
|
|
93
|
+
doc.docdata.copyright_year # => "2024"
|
|
94
|
+
|
|
95
|
+
# Urgency (1-8, 1 being most urgent)
|
|
96
|
+
doc.docdata.urgency # => 4
|
|
97
|
+
|
|
98
|
+
# Identified content
|
|
99
|
+
doc.docdata.subjects # => ["Technology", "Business"]
|
|
100
|
+
doc.docdata.organizations # => ["TechCorp Inc"]
|
|
101
|
+
doc.docdata.people # => ["John Doe"]
|
|
102
|
+
doc.docdata.locations # => ["San Francisco"]
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
### Working with Body Content
|
|
106
|
+
|
|
107
|
+
```ruby
|
|
108
|
+
# Access the body section
|
|
109
|
+
body = doc.body
|
|
110
|
+
|
|
111
|
+
# Dateline and abstract
|
|
112
|
+
body.dateline # => "SAN FRANCISCO, Dec 15"
|
|
113
|
+
body.abstract # => "A new technology platform..."
|
|
114
|
+
|
|
115
|
+
# Block quotes
|
|
116
|
+
body.block_quotes # => ["Innovation distinguishes..."]
|
|
117
|
+
|
|
118
|
+
# Tagline from body.end
|
|
119
|
+
body.tagline # => "Contact: press@example.com"
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### Working with Paragraphs
|
|
123
|
+
|
|
124
|
+
```ruby
|
|
125
|
+
doc.paragraphs.each do |para|
|
|
126
|
+
# Check if it's the lead paragraph
|
|
127
|
+
puts "LEAD: " if para.lead?
|
|
128
|
+
|
|
129
|
+
# Get plain text
|
|
130
|
+
puts para.text
|
|
131
|
+
|
|
132
|
+
# Get entities mentioned in this paragraph
|
|
133
|
+
puts "People: #{para.people.join(', ')}"
|
|
134
|
+
puts "Organizations: #{para.organizations.join(', ')}"
|
|
135
|
+
puts "Locations: #{para.locations.join(', ')}"
|
|
136
|
+
|
|
137
|
+
# Get emphasized text
|
|
138
|
+
puts "Emphasized: #{para.emphasis.join(', ')}"
|
|
139
|
+
|
|
140
|
+
# Get links
|
|
141
|
+
para.links.each do |link|
|
|
142
|
+
puts "Link: #{link[:text]} -> #{link[:href]}"
|
|
143
|
+
end
|
|
144
|
+
|
|
145
|
+
# Word count
|
|
146
|
+
puts "Words: #{para.word_count}"
|
|
147
|
+
end
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
### Working with Media
|
|
151
|
+
|
|
152
|
+
```ruby
|
|
153
|
+
doc.media.each do |media|
|
|
154
|
+
puts "Caption: #{media.caption}"
|
|
155
|
+
puts "Credit: #{media.credit}"
|
|
156
|
+
puts "MIME type: #{media.mime_type}"
|
|
157
|
+
|
|
158
|
+
if media.image?
|
|
159
|
+
puts "Image: #{media.source}"
|
|
160
|
+
puts "Size: #{media.width}x#{media.height}"
|
|
161
|
+
puts "Alt text: #{media.alt_text}"
|
|
162
|
+
elsif media.video?
|
|
163
|
+
puts "Video: #{media.source}"
|
|
164
|
+
elsif media.audio?
|
|
165
|
+
puts "Audio: #{media.source}"
|
|
166
|
+
end
|
|
167
|
+
|
|
168
|
+
# Access all references (different sizes/formats)
|
|
169
|
+
media.references.each do |ref|
|
|
170
|
+
puts " #{ref[:source]} (#{ref[:mime_type]})"
|
|
171
|
+
end
|
|
172
|
+
end
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
### Error Handling
|
|
176
|
+
|
|
177
|
+
```ruby
|
|
178
|
+
begin
|
|
179
|
+
doc = NITFr.parse(xml)
|
|
180
|
+
rescue NITFr::ParseError => e
|
|
181
|
+
puts "Invalid XML: #{e.message}"
|
|
182
|
+
rescue NITFr::InvalidDocumentError => e
|
|
183
|
+
puts "Not a valid NITF document: #{e.message}"
|
|
184
|
+
end
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
### Document Attributes
|
|
188
|
+
|
|
189
|
+
```ruby
|
|
190
|
+
# NITF version and change information
|
|
191
|
+
doc.version # => "-//IPTC//DTD NITF 3.5//EN"
|
|
192
|
+
doc.change_date # => "October 18, 2007"
|
|
193
|
+
doc.change_time # => "19:30"
|
|
194
|
+
|
|
195
|
+
# Check validity
|
|
196
|
+
doc.valid? # => true
|
|
197
|
+
|
|
198
|
+
# Get raw XML
|
|
199
|
+
doc.to_xml # => "<?xml version..."
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
## Advanced Usage
|
|
203
|
+
|
|
204
|
+
### Head Section Details
|
|
205
|
+
|
|
206
|
+
```ruby
|
|
207
|
+
head = doc.head
|
|
208
|
+
|
|
209
|
+
# Meta tags as a hash
|
|
210
|
+
head.meta # => {"keywords" => "tech, news", "author" => "Jane"}
|
|
211
|
+
head.keywords # => ["tech, news"]
|
|
212
|
+
|
|
213
|
+
# Publication data
|
|
214
|
+
head.pubdata[:type] # => "print"
|
|
215
|
+
head.pubdata[:name] # => "Example Times"
|
|
216
|
+
head.pubdata[:edition] # => "Morning"
|
|
217
|
+
head.pubdata[:volume] # => "42"
|
|
218
|
+
|
|
219
|
+
# Revision history
|
|
220
|
+
head.revision_history.each do |rev|
|
|
221
|
+
puts "#{rev[:name]} (#{rev[:function]}): #{rev[:comment]}"
|
|
222
|
+
end
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
### Extended Docdata
|
|
226
|
+
|
|
227
|
+
```ruby
|
|
228
|
+
docdata = doc.docdata
|
|
229
|
+
|
|
230
|
+
# Additional dates
|
|
231
|
+
docdata.release_date # => #<Date: 2024-12-15>
|
|
232
|
+
docdata.expire_date # => #<Date: 2024-12-31>
|
|
233
|
+
|
|
234
|
+
# Document scope and fixture
|
|
235
|
+
docdata.doc_scope # => "national"
|
|
236
|
+
docdata.fixture # => "fixture-123"
|
|
237
|
+
|
|
238
|
+
# Series information
|
|
239
|
+
docdata.series[:name] # => "Investigation"
|
|
240
|
+
docdata.series[:part] # => 2
|
|
241
|
+
docdata.series[:total] # => 5
|
|
242
|
+
|
|
243
|
+
# Editorial status
|
|
244
|
+
docdata.management_status[:info] # => "Approved"
|
|
245
|
+
docdata.management_status[:message_type] # => "advisory"
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
### Body Section Extras
|
|
249
|
+
|
|
250
|
+
```ruby
|
|
251
|
+
body = doc.body
|
|
252
|
+
|
|
253
|
+
# Distributor and series
|
|
254
|
+
body.distributor # => "Wire Service"
|
|
255
|
+
body.series[:name] # => "Special Report"
|
|
256
|
+
body.series[:part] # => "1"
|
|
257
|
+
body.series[:totalpart] # => "3"
|
|
258
|
+
|
|
259
|
+
# Lists in the content
|
|
260
|
+
body.lists.each do |list|
|
|
261
|
+
puts "#{list[:type]}: #{list[:items].join(', ')}"
|
|
262
|
+
end
|
|
263
|
+
|
|
264
|
+
# Tables (returns raw REXML elements)
|
|
265
|
+
body.tables.each do |table|
|
|
266
|
+
# Process table XML as needed
|
|
267
|
+
end
|
|
268
|
+
|
|
269
|
+
# Notes from body.end
|
|
270
|
+
body.notes # => ["Editor's note: ...", "Correction: ..."]
|
|
271
|
+
|
|
272
|
+
# Bibliography
|
|
273
|
+
body.body_end_content[:bibliography] # => ["Source 1", "Source 2"]
|
|
274
|
+
```
|
|
275
|
+
|
|
276
|
+
## NITF Structure
|
|
277
|
+
|
|
278
|
+
A typical NITF document has this structure:
|
|
279
|
+
|
|
280
|
+
```xml
|
|
281
|
+
<nitf>
|
|
282
|
+
<head>
|
|
283
|
+
<title>...</title>
|
|
284
|
+
<docdata>
|
|
285
|
+
<doc-id id-string="..."/>
|
|
286
|
+
<date.issue norm="YYYYMMDD"/>
|
|
287
|
+
...
|
|
288
|
+
</docdata>
|
|
289
|
+
</head>
|
|
290
|
+
<body>
|
|
291
|
+
<body.head>
|
|
292
|
+
<headline>
|
|
293
|
+
<hl1>Primary Headline</hl1>
|
|
294
|
+
<hl2>Secondary Headline</hl2>
|
|
295
|
+
</headline>
|
|
296
|
+
<byline>By Author Name</byline>
|
|
297
|
+
<dateline>CITY, Date</dateline>
|
|
298
|
+
</body.head>
|
|
299
|
+
<body.content>
|
|
300
|
+
<p>Paragraph content...</p>
|
|
301
|
+
<media media-type="image">...</media>
|
|
302
|
+
</body.content>
|
|
303
|
+
<body.end>
|
|
304
|
+
<tagline>...</tagline>
|
|
305
|
+
</body.end>
|
|
306
|
+
</body>
|
|
307
|
+
</nitf>
|
|
308
|
+
```
|
|
309
|
+
|
|
310
|
+
## Development
|
|
311
|
+
|
|
312
|
+
After checking out the repo, install dependencies and run the tests:
|
|
313
|
+
|
|
314
|
+
```bash
|
|
315
|
+
bundle install
|
|
316
|
+
bundle exec rake test
|
|
317
|
+
```
|
|
318
|
+
|
|
319
|
+
## Contributing
|
|
320
|
+
|
|
321
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/amerine/nitfr.
|
|
322
|
+
|
|
323
|
+
## License
|
|
324
|
+
|
|
325
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
|
326
|
+
|
|
327
|
+
## References
|
|
328
|
+
|
|
329
|
+
- [IPTC NITF Specification](https://iptc.org/standards/nitf/)
|
|
330
|
+
- [NITF 3.5 DTD](http://www.nitf.org/IPTC/NITF/3.5/specification/nitf-3-5.dtd)
|
data/Rakefile
ADDED
data/lib/nitfr/body.rb
ADDED
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module NITFr
|
|
4
|
+
# Represents the body section of an NITF document
|
|
5
|
+
#
|
|
6
|
+
# The body contains the main content of the news article including
|
|
7
|
+
# headline, byline, and the actual article text in body.content.
|
|
8
|
+
class Body
|
|
9
|
+
attr_reader :node
|
|
10
|
+
|
|
11
|
+
def initialize(node)
|
|
12
|
+
@node = node
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
# Get the headline object
|
|
16
|
+
#
|
|
17
|
+
# @return [Headline, nil] the headline with all levels
|
|
18
|
+
def headline
|
|
19
|
+
@headline ||= begin
|
|
20
|
+
headline_node = body_head && xpath_first(body_head, "headline")
|
|
21
|
+
Headline.new(headline_node) if headline_node
|
|
22
|
+
end
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
# Get the byline object
|
|
26
|
+
#
|
|
27
|
+
# @return [Byline, nil] the byline information
|
|
28
|
+
def byline
|
|
29
|
+
@byline ||= begin
|
|
30
|
+
byline_node = body_head && xpath_first(body_head, "byline")
|
|
31
|
+
Byline.new(byline_node) if byline_node
|
|
32
|
+
end
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
# Get the dateline text
|
|
36
|
+
#
|
|
37
|
+
# @return [String, nil] the dateline
|
|
38
|
+
def dateline
|
|
39
|
+
@dateline ||= (body_head && xpath_first(body_head, "dateline"))&.text&.strip
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
# Get the abstract/summary
|
|
43
|
+
#
|
|
44
|
+
# @return [String, nil] the abstract text
|
|
45
|
+
def abstract
|
|
46
|
+
@abstract ||= (body_head && xpath_first(body_head, "abstract"))&.text&.strip
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
# Get distributor information
|
|
50
|
+
#
|
|
51
|
+
# @return [String, nil] the distributor
|
|
52
|
+
def distributor
|
|
53
|
+
@distributor ||= (body_head && xpath_first(body_head, "distributor"))&.text&.strip
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
# Get series information
|
|
57
|
+
#
|
|
58
|
+
# @return [Hash, nil] series metadata
|
|
59
|
+
def series
|
|
60
|
+
@series ||= parse_series
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
# Get all paragraphs from body.content
|
|
64
|
+
#
|
|
65
|
+
# @return [Array<Paragraph>] array of paragraph objects
|
|
66
|
+
def paragraphs
|
|
67
|
+
@paragraphs ||= begin
|
|
68
|
+
return [] unless body_content
|
|
69
|
+
|
|
70
|
+
xpath_match(body_content, ".//p").map { |p| Paragraph.new(p) }
|
|
71
|
+
end
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
# Get all media objects from body.content
|
|
75
|
+
#
|
|
76
|
+
# @return [Array<Media>] array of media objects
|
|
77
|
+
def media
|
|
78
|
+
@media ||= begin
|
|
79
|
+
return [] unless body_content
|
|
80
|
+
|
|
81
|
+
xpath_match(body_content, ".//media").map { |m| Media.new(m) }
|
|
82
|
+
end
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
# Get all block quotes
|
|
86
|
+
#
|
|
87
|
+
# @return [Array<String>] array of block quote texts
|
|
88
|
+
def block_quotes
|
|
89
|
+
@block_quotes ||= begin
|
|
90
|
+
return [] unless body_content
|
|
91
|
+
|
|
92
|
+
xpath_match(body_content, ".//bq/block").map { |b| b.text&.strip }.compact
|
|
93
|
+
end
|
|
94
|
+
end
|
|
95
|
+
|
|
96
|
+
# Get all lists from the content
|
|
97
|
+
#
|
|
98
|
+
# @return [Array<Hash>] array of list structures
|
|
99
|
+
def lists
|
|
100
|
+
@lists ||= parse_lists
|
|
101
|
+
end
|
|
102
|
+
|
|
103
|
+
# Get all tables from the content
|
|
104
|
+
#
|
|
105
|
+
# @return [Array<REXML::Element>] raw table nodes
|
|
106
|
+
def tables
|
|
107
|
+
@tables ||= begin
|
|
108
|
+
return [] unless body_content
|
|
109
|
+
|
|
110
|
+
xpath_match(body_content, ".//table")
|
|
111
|
+
end
|
|
112
|
+
end
|
|
113
|
+
|
|
114
|
+
# Get the body.end content (tagline, notes, bibliography)
|
|
115
|
+
#
|
|
116
|
+
# @return [Hash] body end content
|
|
117
|
+
def body_end_content
|
|
118
|
+
@body_end_content ||= parse_body_end
|
|
119
|
+
end
|
|
120
|
+
|
|
121
|
+
# Get the tagline
|
|
122
|
+
#
|
|
123
|
+
# @return [String, nil] the tagline text
|
|
124
|
+
def tagline
|
|
125
|
+
body_end_content[:tagline]
|
|
126
|
+
end
|
|
127
|
+
|
|
128
|
+
# Get notes from body.end
|
|
129
|
+
#
|
|
130
|
+
# @return [Array<String>] array of notes
|
|
131
|
+
def notes
|
|
132
|
+
body_end_content[:notes] || []
|
|
133
|
+
end
|
|
134
|
+
|
|
135
|
+
private
|
|
136
|
+
|
|
137
|
+
def xpath_first(context, path)
|
|
138
|
+
REXML::XPath.first(context, path)
|
|
139
|
+
end
|
|
140
|
+
|
|
141
|
+
def xpath_match(context, path)
|
|
142
|
+
REXML::XPath.match(context, path)
|
|
143
|
+
end
|
|
144
|
+
|
|
145
|
+
def body_head
|
|
146
|
+
@body_head ||= xpath_first(node, "body.head")
|
|
147
|
+
end
|
|
148
|
+
|
|
149
|
+
def body_content
|
|
150
|
+
@body_content ||= xpath_first(node, "body.content")
|
|
151
|
+
end
|
|
152
|
+
|
|
153
|
+
def body_end
|
|
154
|
+
@body_end ||= xpath_first(node, "body.end")
|
|
155
|
+
end
|
|
156
|
+
|
|
157
|
+
def parse_series
|
|
158
|
+
return nil unless body_head
|
|
159
|
+
|
|
160
|
+
series_node = xpath_first(body_head, "series")
|
|
161
|
+
return nil unless series_node
|
|
162
|
+
|
|
163
|
+
{
|
|
164
|
+
name: series_node.attributes["series.name"],
|
|
165
|
+
part: series_node.attributes["series.part"],
|
|
166
|
+
totalpart: series_node.attributes["series.totalpart"]
|
|
167
|
+
}.compact
|
|
168
|
+
end
|
|
169
|
+
|
|
170
|
+
def parse_lists
|
|
171
|
+
return [] unless body_content
|
|
172
|
+
|
|
173
|
+
xpath_match(body_content, ".//ul | .//ol | .//dl").map do |list|
|
|
174
|
+
{
|
|
175
|
+
type: list.name,
|
|
176
|
+
items: xpath_match(list, ".//li | .//dt | .//dd").map { |item| item.text&.strip }.compact
|
|
177
|
+
}
|
|
178
|
+
end
|
|
179
|
+
end
|
|
180
|
+
|
|
181
|
+
def parse_body_end
|
|
182
|
+
return {} unless body_end
|
|
183
|
+
|
|
184
|
+
{
|
|
185
|
+
tagline: xpath_first(body_end, "tagline")&.text&.strip,
|
|
186
|
+
notes: xpath_match(body_end, ".//note").map { |n| n.text&.strip }.compact,
|
|
187
|
+
bibliography: xpath_match(body_end, ".//biblio").map { |b| b.text&.strip }.compact
|
|
188
|
+
}.compact
|
|
189
|
+
end
|
|
190
|
+
end
|
|
191
|
+
end
|
data/lib/nitfr/byline.rb
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module NITFr
|
|
4
|
+
# Represents byline information from an NITF document
|
|
5
|
+
#
|
|
6
|
+
# Bylines can contain the author name, their title/role,
|
|
7
|
+
# and additional location or organization information.
|
|
8
|
+
class Byline
|
|
9
|
+
include TextExtractor
|
|
10
|
+
|
|
11
|
+
attr_reader :node
|
|
12
|
+
|
|
13
|
+
def initialize(node)
|
|
14
|
+
@node = node
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
# Get the full byline text
|
|
18
|
+
#
|
|
19
|
+
# @return [String] the complete byline text
|
|
20
|
+
def text
|
|
21
|
+
@text ||= extract_all_text(node).strip
|
|
22
|
+
end
|
|
23
|
+
alias to_s text
|
|
24
|
+
|
|
25
|
+
# Get the person/author name from the person element
|
|
26
|
+
#
|
|
27
|
+
# @return [String, nil] the author name
|
|
28
|
+
def person
|
|
29
|
+
@person ||= xpath_first("person")&.text&.strip
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
# Get the byline title/role
|
|
33
|
+
#
|
|
34
|
+
# @return [String, nil] the title or role
|
|
35
|
+
def title
|
|
36
|
+
@title ||= xpath_first("byttl")&.text&.strip
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
# Get the location if specified
|
|
40
|
+
#
|
|
41
|
+
# @return [String, nil] the location
|
|
42
|
+
def location
|
|
43
|
+
@location ||= xpath_first("location")&.text&.strip
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
# Get organization/affiliation
|
|
47
|
+
#
|
|
48
|
+
# @return [String, nil] the organization name
|
|
49
|
+
def org
|
|
50
|
+
@org ||= xpath_first("org")&.text&.strip
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
# Check if byline has content
|
|
54
|
+
#
|
|
55
|
+
# @return [Boolean] true if byline has text
|
|
56
|
+
def present?
|
|
57
|
+
!text.empty?
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
private
|
|
61
|
+
|
|
62
|
+
def xpath_first(path)
|
|
63
|
+
REXML::XPath.first(node, path)
|
|
64
|
+
end
|
|
65
|
+
end
|
|
66
|
+
end
|