pdf-extract 0.0.4 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/analysis/sections.rb +11 -4
- data/lib/spatial.rb +5 -2
- data/lib/view/xml_view.rb +11 -6
- metadata +3 -3
data/lib/analysis/sections.rb
CHANGED
@@ -51,14 +51,16 @@ module PdfExtract
|
|
51
51
|
end.flatten
|
52
52
|
end
|
53
53
|
|
54
|
-
def self.add_content_stats sections
|
54
|
+
def self.add_content_stats sections, page_count
|
55
55
|
sections.map do |section|
|
56
|
+
last_page = section[:components].max {|c| c[:page]}[:page]
|
56
57
|
content = Spatial.get_text_content section
|
57
58
|
Spatial.drop_spatial(section).merge({
|
58
59
|
:letter_ratio => Language.letter_ratio(content),
|
59
60
|
:year_ratio => Language.year_ratio(content), :cap_ratio => Language.cap_ratio(content),
|
60
61
|
:name_ratio => Language.name_ratio(content),
|
61
|
-
:word_count => Language.word_count(content)
|
62
|
+
:word_count => Language.word_count(content),
|
63
|
+
:lateness => (last_page / page_count.to_f)
|
62
64
|
})
|
63
65
|
end
|
64
66
|
end
|
@@ -112,8 +114,13 @@ module PdfExtract
|
|
112
114
|
if !found.last.nil? && match?(found.last, region)
|
113
115
|
content = Spatial.merge_lines(found.last, region, {})
|
114
116
|
found.last.merge!(content)
|
117
|
+
|
118
|
+
found.last[:components] << Spatial.get_dimensions(region)
|
119
|
+
|
115
120
|
else
|
116
|
-
found << region
|
121
|
+
found << region.merge({
|
122
|
+
:components => [Spatial.get_dimensions(region)]
|
123
|
+
})
|
117
124
|
end
|
118
125
|
else
|
119
126
|
sections = sections + found
|
@@ -128,7 +135,7 @@ module PdfExtract
|
|
128
135
|
|
129
136
|
# We now have sections. Add information to them.
|
130
137
|
# add_content_types sections
|
131
|
-
sections = add_content_stats sections
|
138
|
+
sections = add_content_stats sections, pages.keys.count
|
132
139
|
|
133
140
|
# Score sections into categories based on their textual attributes.
|
134
141
|
ideals = {
|
data/lib/spatial.rb
CHANGED
@@ -84,7 +84,10 @@ module PdfExtract
|
|
84
84
|
:x => obj[:x],
|
85
85
|
:y => obj[:y],
|
86
86
|
:width => obj[:width],
|
87
|
-
:height => obj[:height]
|
87
|
+
:height => obj[:height],
|
88
|
+
:page => obj[:page],
|
89
|
+
:page_width => obj[:page_width],
|
90
|
+
:page_height => obj[:page_height]
|
88
91
|
}
|
89
92
|
end
|
90
93
|
|
@@ -104,7 +107,7 @@ module PdfExtract
|
|
104
107
|
elsif obj[:content]
|
105
108
|
obj[:content]
|
106
109
|
else
|
107
|
-
|
110
|
+
""
|
108
111
|
end
|
109
112
|
end
|
110
113
|
|
data/lib/view/xml_view.rb
CHANGED
@@ -6,15 +6,20 @@ require_relative '../language'
|
|
6
6
|
module PdfExtract
|
7
7
|
class XmlView < AbstractView
|
8
8
|
|
9
|
-
@@ignored_attributes = [:content
|
9
|
+
@@ignored_attributes = [:content]
|
10
|
+
|
11
|
+
@@parent_ignored_attributes = [:page, :page_width, :page_height]
|
10
12
|
|
11
13
|
@@numeric_attributes = [:x, :y, :width, :height, :line_height,
|
12
14
|
:page_height, :page_width, :x_offset, :y_offset,
|
13
15
|
:spacing, :letter_ratio, :cap_ratio, :year_ratio]
|
14
16
|
|
15
17
|
# Return renderable attributes
|
16
|
-
def get_xml_attributes obj
|
18
|
+
def get_xml_attributes obj, parent=true
|
17
19
|
attribs = obj.reject { |k, _| @@ignored_attributes.include? k }
|
20
|
+
if parent
|
21
|
+
attribs = attribs.reject { |k, _| @@parent_ignored_attributes.include? k }
|
22
|
+
end
|
18
23
|
attribs = attribs.reject { |_, v| v.kind_of?(Hash) || v.kind_of?(Array) }
|
19
24
|
attribs.each_pair do |k, v|
|
20
25
|
if @@numeric_attributes.include?(k) || k.to_s =~ /.+_score/
|
@@ -79,8 +84,8 @@ module PdfExtract
|
|
79
84
|
builder.to_xml
|
80
85
|
end
|
81
86
|
|
82
|
-
def write_obj_to_xml obj, type, xml
|
83
|
-
xml.send singular_name(type.to_s), get_xml_attributes(obj) do
|
87
|
+
def write_obj_to_xml obj, type, xml, parent=true
|
88
|
+
xml.send singular_name(type.to_s), get_xml_attributes(obj, parent) do
|
84
89
|
|
85
90
|
unless @render_options[:outline]
|
86
91
|
if not @render_options[:lines]
|
@@ -93,10 +98,10 @@ module PdfExtract
|
|
93
98
|
get_nested_objs(obj).each do |name, nested_obj|
|
94
99
|
element_name = singular_name name.to_s
|
95
100
|
if nested_obj.kind_of? Hash
|
96
|
-
write_obj_to_xml nested_obj, element_name, xml
|
101
|
+
write_obj_to_xml nested_obj, element_name, xml, false
|
97
102
|
elsif nested_obj.kind_of? Array
|
98
103
|
nested_obj.each do |item|
|
99
|
-
write_obj_to_xml item, element_name, xml
|
104
|
+
write_obj_to_xml item, element_name, xml, false
|
100
105
|
end
|
101
106
|
end
|
102
107
|
end
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 0
|
8
|
-
- 4
|
9
|
-
version: 0.0.4
|
8
|
+
- 6
|
9
|
+
version: 0.0.6
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Karl Jonathan Ward
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-
|
17
|
+
date: 2011-11-02 00:00:00 +00:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|