pdf-extract 0.0.4 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/analysis/sections.rb +11 -4
- data/lib/spatial.rb +5 -2
- data/lib/view/xml_view.rb +11 -6
- metadata +3 -3
data/lib/analysis/sections.rb
CHANGED
@@ -51,14 +51,16 @@ module PdfExtract
|
|
51
51
|
end.flatten
|
52
52
|
end
|
53
53
|
|
54
|
-
def self.add_content_stats sections
|
54
|
+
def self.add_content_stats sections, page_count
|
55
55
|
sections.map do |section|
|
56
|
+
last_page = section[:components].max {|c| c[:page]}[:page]
|
56
57
|
content = Spatial.get_text_content section
|
57
58
|
Spatial.drop_spatial(section).merge({
|
58
59
|
:letter_ratio => Language.letter_ratio(content),
|
59
60
|
:year_ratio => Language.year_ratio(content), :cap_ratio => Language.cap_ratio(content),
|
60
61
|
:name_ratio => Language.name_ratio(content),
|
61
|
-
:word_count => Language.word_count(content)
|
62
|
+
:word_count => Language.word_count(content),
|
63
|
+
:lateness => (last_page / page_count.to_f)
|
62
64
|
})
|
63
65
|
end
|
64
66
|
end
|
@@ -112,8 +114,13 @@ module PdfExtract
|
|
112
114
|
if !found.last.nil? && match?(found.last, region)
|
113
115
|
content = Spatial.merge_lines(found.last, region, {})
|
114
116
|
found.last.merge!(content)
|
117
|
+
|
118
|
+
found.last[:components] << Spatial.get_dimensions(region)
|
119
|
+
|
115
120
|
else
|
116
|
-
found << region
|
121
|
+
found << region.merge({
|
122
|
+
:components => [Spatial.get_dimensions(region)]
|
123
|
+
})
|
117
124
|
end
|
118
125
|
else
|
119
126
|
sections = sections + found
|
@@ -128,7 +135,7 @@ module PdfExtract
|
|
128
135
|
|
129
136
|
# We now have sections. Add information to them.
|
130
137
|
# add_content_types sections
|
131
|
-
sections = add_content_stats sections
|
138
|
+
sections = add_content_stats sections, pages.keys.count
|
132
139
|
|
133
140
|
# Score sections into categories based on their textual attributes.
|
134
141
|
ideals = {
|
data/lib/spatial.rb
CHANGED
@@ -84,7 +84,10 @@ module PdfExtract
|
|
84
84
|
:x => obj[:x],
|
85
85
|
:y => obj[:y],
|
86
86
|
:width => obj[:width],
|
87
|
-
:height => obj[:height]
|
87
|
+
:height => obj[:height],
|
88
|
+
:page => obj[:page],
|
89
|
+
:page_width => obj[:page_width],
|
90
|
+
:page_height => obj[:page_height]
|
88
91
|
}
|
89
92
|
end
|
90
93
|
|
@@ -104,7 +107,7 @@ module PdfExtract
|
|
104
107
|
elsif obj[:content]
|
105
108
|
obj[:content]
|
106
109
|
else
|
107
|
-
|
110
|
+
""
|
108
111
|
end
|
109
112
|
end
|
110
113
|
|
data/lib/view/xml_view.rb
CHANGED
@@ -6,15 +6,20 @@ require_relative '../language'
|
|
6
6
|
module PdfExtract
|
7
7
|
class XmlView < AbstractView
|
8
8
|
|
9
|
-
@@ignored_attributes = [:content
|
9
|
+
@@ignored_attributes = [:content]
|
10
|
+
|
11
|
+
@@parent_ignored_attributes = [:page, :page_width, :page_height]
|
10
12
|
|
11
13
|
@@numeric_attributes = [:x, :y, :width, :height, :line_height,
|
12
14
|
:page_height, :page_width, :x_offset, :y_offset,
|
13
15
|
:spacing, :letter_ratio, :cap_ratio, :year_ratio]
|
14
16
|
|
15
17
|
# Return renderable attributes
|
16
|
-
def get_xml_attributes obj
|
18
|
+
def get_xml_attributes obj, parent=true
|
17
19
|
attribs = obj.reject { |k, _| @@ignored_attributes.include? k }
|
20
|
+
if parent
|
21
|
+
attribs = attribs.reject { |k, _| @@parent_ignored_attributes.include? k }
|
22
|
+
end
|
18
23
|
attribs = attribs.reject { |_, v| v.kind_of?(Hash) || v.kind_of?(Array) }
|
19
24
|
attribs.each_pair do |k, v|
|
20
25
|
if @@numeric_attributes.include?(k) || k.to_s =~ /.+_score/
|
@@ -79,8 +84,8 @@ module PdfExtract
|
|
79
84
|
builder.to_xml
|
80
85
|
end
|
81
86
|
|
82
|
-
def write_obj_to_xml obj, type, xml
|
83
|
-
xml.send singular_name(type.to_s), get_xml_attributes(obj) do
|
87
|
+
def write_obj_to_xml obj, type, xml, parent=true
|
88
|
+
xml.send singular_name(type.to_s), get_xml_attributes(obj, parent) do
|
84
89
|
|
85
90
|
unless @render_options[:outline]
|
86
91
|
if not @render_options[:lines]
|
@@ -93,10 +98,10 @@ module PdfExtract
|
|
93
98
|
get_nested_objs(obj).each do |name, nested_obj|
|
94
99
|
element_name = singular_name name.to_s
|
95
100
|
if nested_obj.kind_of? Hash
|
96
|
-
write_obj_to_xml nested_obj, element_name, xml
|
101
|
+
write_obj_to_xml nested_obj, element_name, xml, false
|
97
102
|
elsif nested_obj.kind_of? Array
|
98
103
|
nested_obj.each do |item|
|
99
|
-
write_obj_to_xml item, element_name, xml
|
104
|
+
write_obj_to_xml item, element_name, xml, false
|
100
105
|
end
|
101
106
|
end
|
102
107
|
end
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 0
|
8
|
-
- 4
|
9
|
-
version: 0.0.4
|
8
|
+
- 6
|
9
|
+
version: 0.0.6
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Karl Jonathan Ward
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-
|
17
|
+
date: 2011-11-02 00:00:00 +00:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|