pdf-extract 0.0.4 → 0.0.6

Sign up to get free protection for your applications and to get access to all the features.
@@ -51,14 +51,16 @@ module PdfExtract
51
51
  end.flatten
52
52
  end
53
53
 
54
- def self.add_content_stats sections
54
+ def self.add_content_stats sections, page_count
55
55
  sections.map do |section|
56
+ last_page = section[:components].max {|c| c[:page]}[:page]
56
57
  content = Spatial.get_text_content section
57
58
  Spatial.drop_spatial(section).merge({
58
59
  :letter_ratio => Language.letter_ratio(content),
59
60
  :year_ratio => Language.year_ratio(content), :cap_ratio => Language.cap_ratio(content),
60
61
  :name_ratio => Language.name_ratio(content),
61
- :word_count => Language.word_count(content)
62
+ :word_count => Language.word_count(content),
63
+ :lateness => (last_page / page_count.to_f)
62
64
  })
63
65
  end
64
66
  end
@@ -112,8 +114,13 @@ module PdfExtract
112
114
  if !found.last.nil? && match?(found.last, region)
113
115
  content = Spatial.merge_lines(found.last, region, {})
114
116
  found.last.merge!(content)
117
+
118
+ found.last[:components] << Spatial.get_dimensions(region)
119
+
115
120
  else
116
- found << region
121
+ found << region.merge({
122
+ :components => [Spatial.get_dimensions(region)]
123
+ })
117
124
  end
118
125
  else
119
126
  sections = sections + found
@@ -128,7 +135,7 @@ module PdfExtract
128
135
 
129
136
  # We now have sections. Add information to them.
130
137
  # add_content_types sections
131
- sections = add_content_stats sections
138
+ sections = add_content_stats sections, pages.keys.count
132
139
 
133
140
  # Score sections into categories based on their textual attributes.
134
141
  ideals = {
data/lib/spatial.rb CHANGED
@@ -84,7 +84,10 @@ module PdfExtract
84
84
  :x => obj[:x],
85
85
  :y => obj[:y],
86
86
  :width => obj[:width],
87
- :height => obj[:height]
87
+ :height => obj[:height],
88
+ :page => obj[:page],
89
+ :page_width => obj[:page_width],
90
+ :page_height => obj[:page_height]
88
91
  }
89
92
  end
90
93
 
@@ -104,7 +107,7 @@ module PdfExtract
104
107
  elsif obj[:content]
105
108
  obj[:content]
106
109
  else
107
- obj
110
+ ""
108
111
  end
109
112
  end
110
113
 
data/lib/view/xml_view.rb CHANGED
@@ -6,15 +6,20 @@ require_relative '../language'
6
6
  module PdfExtract
7
7
  class XmlView < AbstractView
8
8
 
9
- @@ignored_attributes = [:content, :page, :page_width, :page_height]
9
+ @@ignored_attributes = [:content]
10
+
11
+ @@parent_ignored_attributes = [:page, :page_width, :page_height]
10
12
 
11
13
  @@numeric_attributes = [:x, :y, :width, :height, :line_height,
12
14
  :page_height, :page_width, :x_offset, :y_offset,
13
15
  :spacing, :letter_ratio, :cap_ratio, :year_ratio]
14
16
 
15
17
  # Return renderable attributes
16
- def get_xml_attributes obj
18
+ def get_xml_attributes obj, parent=true
17
19
  attribs = obj.reject { |k, _| @@ignored_attributes.include? k }
20
+ if parent
21
+ attribs = attribs.reject { |k, _| @@parent_ignored_attributes.include? k }
22
+ end
18
23
  attribs = attribs.reject { |_, v| v.kind_of?(Hash) || v.kind_of?(Array) }
19
24
  attribs.each_pair do |k, v|
20
25
  if @@numeric_attributes.include?(k) || k.to_s =~ /.+_score/
@@ -79,8 +84,8 @@ module PdfExtract
79
84
  builder.to_xml
80
85
  end
81
86
 
82
- def write_obj_to_xml obj, type, xml
83
- xml.send singular_name(type.to_s), get_xml_attributes(obj) do
87
+ def write_obj_to_xml obj, type, xml, parent=true
88
+ xml.send singular_name(type.to_s), get_xml_attributes(obj, parent) do
84
89
 
85
90
  unless @render_options[:outline]
86
91
  if not @render_options[:lines]
@@ -93,10 +98,10 @@ module PdfExtract
93
98
  get_nested_objs(obj).each do |name, nested_obj|
94
99
  element_name = singular_name name.to_s
95
100
  if nested_obj.kind_of? Hash
96
- write_obj_to_xml nested_obj, element_name, xml
101
+ write_obj_to_xml nested_obj, element_name, xml, false
97
102
  elsif nested_obj.kind_of? Array
98
103
  nested_obj.each do |item|
99
- write_obj_to_xml item, element_name, xml
104
+ write_obj_to_xml item, element_name, xml, false
100
105
  end
101
106
  end
102
107
  end
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 0
8
- - 4
9
- version: 0.0.4
8
+ - 6
9
+ version: 0.0.6
10
10
  platform: ruby
11
11
  authors:
12
12
  - Karl Jonathan Ward
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-10-24 00:00:00 +01:00
17
+ date: 2011-11-02 00:00:00 +00:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency