nitfr 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +330 -0
- data/Rakefile +12 -0
- data/lib/nitfr/body.rb +191 -0
- data/lib/nitfr/byline.rb +66 -0
- data/lib/nitfr/docdata.rb +201 -0
- data/lib/nitfr/document.rb +173 -0
- data/lib/nitfr/errors.rb +12 -0
- data/lib/nitfr/head.rb +101 -0
- data/lib/nitfr/headline.rb +58 -0
- data/lib/nitfr/media.rb +139 -0
- data/lib/nitfr/paragraph.rb +162 -0
- data/lib/nitfr/text_extractor.rb +26 -0
- data/lib/nitfr/version.rb +5 -0
- data/lib/nitfr.rb +48 -0
- metadata +101 -0
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module NITFr
|
|
4
|
+
# Represents a paragraph from an NITF document body.
#
# Paragraphs can contain inline elements like emphasis, links, and
# tagged entities (people, organizations, locations).
#
# Entity accessors (#people, #organizations, #locations, #emphasis,
# #links) are lazy: the first call to any of them walks the paragraph's
# element tree once and fills every entity array; later calls simply
# return the cached arrays.
class Paragraph
  include TextExtractor

  attr_reader :node

  # @param node [REXML::Element] the paragraph element to wrap
  def initialize(node)
    @node = node
    @entities_extracted = false
  end

  # Get the plain text content of the paragraph.
  #
  # Text inside nested inline elements is included; the result is
  # stripped of leading/trailing whitespace and memoized.
  #
  # @return [String] the paragraph text with inline elements stripped
  def text
    @text ||= extract_all_text(node).strip
  end
  alias to_s text

  # Get the paragraph ID if present.
  #
  # @return [String, nil] the value of the "id" attribute
  def id
    node.attributes["id"]
  end

  # Get the paragraph's lede attribute (indicates lead paragraph).
  #
  # @return [String, nil] the raw value of the "lede" attribute
  def lede
    node.attributes["lede"]
  end

  # Check if this is a lead paragraph.
  #
  # @return [Boolean] true when the lede attribute is "true" or "yes"
  def lead?
    lede == "true" || lede == "yes"
  end

  # Get any emphasized text within the paragraph.
  #
  # @return [Array<String>] text of each <em> element, in document order
  def emphasis
    extract_entities unless @entities_extracted
    @emphasis
  end

  # Get any links within the paragraph.
  #
  # @return [Array<Hash>] one hash per <a> element with :text and :href keys
  def links
    extract_entities unless @entities_extracted
    @links
  end

  # Get any person references in the paragraph.
  #
  # @return [Array<String>] text of each <person> element
  def people
    extract_entities unless @entities_extracted
    @people
  end

  # Get any organization references in the paragraph.
  #
  # @return [Array<String>] text of each <org> element
  def organizations
    extract_entities unless @entities_extracted
    @organizations
  end

  # Get any location references in the paragraph.
  #
  # @return [Array<String>] text of each <location> element
  def locations
    extract_entities unless @entities_extracted
    @locations
  end

  # Get the raw HTML/XML content of the paragraph.
  #
  # @return [String] the serialized child nodes joined together
  def inner_html
    node.children.map(&:to_s).join
  end

  # Check if paragraph has content.
  #
  # @return [Boolean] true if the stripped paragraph text is non-empty
  def present?
    !text.empty?
  end

  # Get word count for the paragraph.
  #
  # @return [Integer] number of whitespace-separated tokens in #text
  def word_count
    return 0 if text.empty?

    text.split(/\s+/).size
  end

  private

  # Populate every entity array with a single walk of the element tree.
  #
  # The extracted flag is set only after the walk completes, so a walk
  # that raises is retried on the next accessor call.
  def extract_entities
    @people = []
    @organizations = []
    @locations = []
    @emphasis = []
    @links = []

    traverse_for_entities(node)

    @entities_extracted = true
  end

  # Recursively visit child elements and record the entities found.
  #
  # Every child is descended into after being examined, so entities
  # nested inside other inline markup are collected as well.
  #
  # @param element [REXML::Element] the element to traverse
  def traverse_for_entities(element)
    element.each_element do |child|
      case child.name
      when "person"   then collect_entity(@people, child)
      when "org"      then collect_entity(@organizations, child)
      when "location" then collect_entity(@locations, child)
      when "em"       then collect_entity(@emphasis, child)
      when "a"
        @links << { text: link_text(child), href: child.attributes["href"] }
      end

      # Continue traversing for nested entities
      traverse_for_entities(child)
    end
  end

  # Append the full text of +element+ to +bucket+ unless it is blank.
  #
  # Uses extract_all_text rather than Element#text: the latter returns
  # only the first direct text node, which truncates or drops entities
  # whose content is wrapped in nested markup.
  #
  # @param bucket [Array<String>] the entity array to append to
  # @param element [REXML::Element] the entity element
  def collect_entity(bucket, element)
    content = extract_all_text(element).strip
    bucket << content unless content.empty?
  end

  # Text for a link, including text nested inside inline markup.
  #
  # @param element [REXML::Element] the <a> element
  # @return [String, nil] stripped link text, or nil when the element
  #   contains no text at all
  def link_text(element)
    content = extract_all_text(element)
    content.empty? ? nil : content.strip
  end
end
|
|
162
|
+
end
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module NITFr
|
|
4
|
+
# Mixin that gathers text from a REXML element tree.
#
# REXML's built-in text method only returns an element's direct text
# content, so text held in nested elements is missed. The helper here
# walks the whole subtree instead.
module TextExtractor
  # Concatenate all text found under an element, descendants included.
  #
  # Text nodes contribute their #value, child elements are descended
  # into recursively, and any other node type is skipped.
  #
  # @param element [REXML::Element] the element to extract text from
  # @return [String] the concatenated text content
  def extract_all_text(element)
    element.children.each_with_object(+"") do |node, buffer|
      case node
      when REXML::Text
        buffer << node.value
      when REXML::Element
        buffer << extract_all_text(node)
      end
    end
  end
end
|
|
26
|
+
end
|
data/lib/nitfr.rb
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "rexml/document"
|
|
4
|
+
require "rexml/xpath"
|
|
5
|
+
|
|
6
|
+
# Configure REXML security settings at load time
|
|
7
|
+
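# This protects against XML entity expansion attacks (Billion Laughs). The limits are set on REXML::Security itself, so they apply process-wide to REXML, not only to documents parsed through NITFr.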
|
|
8
|
+
if defined?(REXML::Security)
|
|
9
|
+
REXML::Security.entity_expansion_limit = 100
|
|
10
|
+
REXML::Security.entity_expansion_text_limit = 10_000
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
require_relative "nitfr/version"
|
|
14
|
+
require_relative "nitfr/errors"
|
|
15
|
+
require_relative "nitfr/text_extractor"
|
|
16
|
+
require_relative "nitfr/document"
|
|
17
|
+
require_relative "nitfr/head"
|
|
18
|
+
require_relative "nitfr/body"
|
|
19
|
+
require_relative "nitfr/headline"
|
|
20
|
+
require_relative "nitfr/byline"
|
|
21
|
+
require_relative "nitfr/paragraph"
|
|
22
|
+
require_relative "nitfr/media"
|
|
23
|
+
require_relative "nitfr/docdata"
|
|
24
|
+
|
|
25
|
+
# Top-level entry points for turning NITF XML into a Document.
module NITFr
  # Build a Document from NITF XML held in a string.
  #
  # @param xml [String] the NITF XML content
  # @return [Document] the parsed document
  # @raise [ParseError] if the XML is invalid or not NITF (raised by
  #   Document, which is defined elsewhere)
  def self.parse(xml)
    Document.new(xml)
  end

  # Read an NITF XML file and build a Document from its contents.
  #
  # The file is read in full with the given encoding and handed to .parse.
  # Errors from File.read are not rescued here, so they reach the caller
  # unchanged.
  #
  # @param path [String] path to the NITF XML file
  # @param encoding [String] the file encoding (default: UTF-8)
  # @return [Document] the parsed document
  # @raise [ParseError] if the XML is invalid (see .parse)
  # @raise [Errno::ENOENT] if the file does not exist
  def self.parse_file(path, encoding: "UTF-8")
    parse(File.read(path, encoding: encoding))
  end
end
|
metadata
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: nitfr
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 1.0.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Mark Turner
|
|
8
|
+
bindir: exe
|
|
9
|
+
cert_chain: []
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
|
+
dependencies:
|
|
12
|
+
- !ruby/object:Gem::Dependency
|
|
13
|
+
name: rexml
|
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
|
15
|
+
requirements:
|
|
16
|
+
- - ">="
|
|
17
|
+
- !ruby/object:Gem::Version
|
|
18
|
+
version: '0'
|
|
19
|
+
type: :runtime
|
|
20
|
+
prerelease: false
|
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
22
|
+
requirements:
|
|
23
|
+
- - ">="
|
|
24
|
+
- !ruby/object:Gem::Version
|
|
25
|
+
version: '0'
|
|
26
|
+
- !ruby/object:Gem::Dependency
|
|
27
|
+
name: rake
|
|
28
|
+
requirement: !ruby/object:Gem::Requirement
|
|
29
|
+
requirements:
|
|
30
|
+
- - "~>"
|
|
31
|
+
- !ruby/object:Gem::Version
|
|
32
|
+
version: '13.0'
|
|
33
|
+
type: :development
|
|
34
|
+
prerelease: false
|
|
35
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
36
|
+
requirements:
|
|
37
|
+
- - "~>"
|
|
38
|
+
- !ruby/object:Gem::Version
|
|
39
|
+
version: '13.0'
|
|
40
|
+
- !ruby/object:Gem::Dependency
|
|
41
|
+
name: test-unit
|
|
42
|
+
requirement: !ruby/object:Gem::Requirement
|
|
43
|
+
requirements:
|
|
44
|
+
- - "~>"
|
|
45
|
+
- !ruby/object:Gem::Version
|
|
46
|
+
version: '3.6'
|
|
47
|
+
type: :development
|
|
48
|
+
prerelease: false
|
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
50
|
+
requirements:
|
|
51
|
+
- - "~>"
|
|
52
|
+
- !ruby/object:Gem::Version
|
|
53
|
+
version: '3.6'
|
|
54
|
+
description: NITFr makes it easy for Ruby applications to parse and extract content
|
|
55
|
+
from NITF XML files, the standard format used in the news industry.
|
|
56
|
+
email:
|
|
57
|
+
- mark@amerine.net
|
|
58
|
+
executables: []
|
|
59
|
+
extensions: []
|
|
60
|
+
extra_rdoc_files: []
|
|
61
|
+
files:
|
|
62
|
+
- LICENSE
|
|
63
|
+
- README.md
|
|
64
|
+
- Rakefile
|
|
65
|
+
- lib/nitfr.rb
|
|
66
|
+
- lib/nitfr/body.rb
|
|
67
|
+
- lib/nitfr/byline.rb
|
|
68
|
+
- lib/nitfr/docdata.rb
|
|
69
|
+
- lib/nitfr/document.rb
|
|
70
|
+
- lib/nitfr/errors.rb
|
|
71
|
+
- lib/nitfr/head.rb
|
|
72
|
+
- lib/nitfr/headline.rb
|
|
73
|
+
- lib/nitfr/media.rb
|
|
74
|
+
- lib/nitfr/paragraph.rb
|
|
75
|
+
- lib/nitfr/text_extractor.rb
|
|
76
|
+
- lib/nitfr/version.rb
|
|
77
|
+
homepage: https://github.com/amerine/nitfr
|
|
78
|
+
licenses:
|
|
79
|
+
- MIT
|
|
80
|
+
metadata:
|
|
81
|
+
homepage_uri: https://github.com/amerine/nitfr
|
|
82
|
+
source_code_uri: https://github.com/amerine/nitfr
|
|
83
|
+
changelog_uri: https://github.com/amerine/nitfr/blob/master/CHANGELOG.md
|
|
84
|
+
rdoc_options: []
|
|
85
|
+
require_paths:
|
|
86
|
+
- lib
|
|
87
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
88
|
+
requirements:
|
|
89
|
+
- - ">="
|
|
90
|
+
- !ruby/object:Gem::Version
|
|
91
|
+
version: 3.0.0
|
|
92
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
93
|
+
requirements:
|
|
94
|
+
- - ">="
|
|
95
|
+
- !ruby/object:Gem::Version
|
|
96
|
+
version: '0'
|
|
97
|
+
requirements: []
|
|
98
|
+
rubygems_version: 3.6.7
|
|
99
|
+
specification_version: 4
|
|
100
|
+
summary: A Ruby gem for parsing NITF (News Industry Text Format) XML files
|
|
101
|
+
test_files: []
|