pdf-extract 0.0.4 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -51,14 +51,16 @@ module PdfExtract
51
51
  end.flatten
52
52
  end
53
53
 
54
- def self.add_content_stats sections
54
+ def self.add_content_stats sections, page_count
55
55
  sections.map do |section|
56
+ last_page = section[:components].max {|c| c[:page]}[:page]
56
57
  content = Spatial.get_text_content section
57
58
  Spatial.drop_spatial(section).merge({
58
59
  :letter_ratio => Language.letter_ratio(content),
59
60
  :year_ratio => Language.year_ratio(content), :cap_ratio => Language.cap_ratio(content),
60
61
  :name_ratio => Language.name_ratio(content),
61
- :word_count => Language.word_count(content)
62
+ :word_count => Language.word_count(content),
63
+ :lateness => (last_page / page_count.to_f)
62
64
  })
63
65
  end
64
66
  end
@@ -112,8 +114,13 @@ module PdfExtract
112
114
  if !found.last.nil? && match?(found.last, region)
113
115
  content = Spatial.merge_lines(found.last, region, {})
114
116
  found.last.merge!(content)
117
+
118
+ found.last[:components] << Spatial.get_dimensions(region)
119
+
115
120
  else
116
- found << region
121
+ found << region.merge({
122
+ :components => [Spatial.get_dimensions(region)]
123
+ })
117
124
  end
118
125
  else
119
126
  sections = sections + found
@@ -128,7 +135,7 @@ module PdfExtract
128
135
 
129
136
  # We now have sections. Add information to them.
130
137
  # add_content_types sections
131
- sections = add_content_stats sections
138
+ sections = add_content_stats sections, pages.keys.count
132
139
 
133
140
  # Score sections into categories based on their textual attributes.
134
141
  ideals = {
data/lib/spatial.rb CHANGED
@@ -84,7 +84,10 @@ module PdfExtract
84
84
  :x => obj[:x],
85
85
  :y => obj[:y],
86
86
  :width => obj[:width],
87
- :height => obj[:height]
87
+ :height => obj[:height],
88
+ :page => obj[:page],
89
+ :page_width => obj[:page_width],
90
+ :page_height => obj[:page_height]
88
91
  }
89
92
  end
90
93
 
@@ -104,7 +107,7 @@ module PdfExtract
104
107
  elsif obj[:content]
105
108
  obj[:content]
106
109
  else
107
- obj
110
+ ""
108
111
  end
109
112
  end
110
113
 
data/lib/view/xml_view.rb CHANGED
@@ -6,15 +6,20 @@ require_relative '../language'
6
6
  module PdfExtract
7
7
  class XmlView < AbstractView
8
8
 
9
- @@ignored_attributes = [:content, :page, :page_width, :page_height]
9
+ @@ignored_attributes = [:content]
10
+
11
+ @@parent_ignored_attributes = [:page, :page_width, :page_height]
10
12
 
11
13
  @@numeric_attributes = [:x, :y, :width, :height, :line_height,
12
14
  :page_height, :page_width, :x_offset, :y_offset,
13
15
  :spacing, :letter_ratio, :cap_ratio, :year_ratio]
14
16
 
15
17
  # Return renderable attributes
16
- def get_xml_attributes obj
18
+ def get_xml_attributes obj, parent=true
17
19
  attribs = obj.reject { |k, _| @@ignored_attributes.include? k }
20
+ if parent
21
+ attribs = attribs.reject { |k, _| @@parent_ignored_attributes.include? k }
22
+ end
18
23
  attribs = attribs.reject { |_, v| v.kind_of?(Hash) || v.kind_of?(Array) }
19
24
  attribs.each_pair do |k, v|
20
25
  if @@numeric_attributes.include?(k) || k.to_s =~ /.+_score/
@@ -79,8 +84,8 @@ module PdfExtract
79
84
  builder.to_xml
80
85
  end
81
86
 
82
- def write_obj_to_xml obj, type, xml
83
- xml.send singular_name(type.to_s), get_xml_attributes(obj) do
87
+ def write_obj_to_xml obj, type, xml, parent=true
88
+ xml.send singular_name(type.to_s), get_xml_attributes(obj, parent) do
84
89
 
85
90
  unless @render_options[:outline]
86
91
  if not @render_options[:lines]
@@ -93,10 +98,10 @@ module PdfExtract
93
98
  get_nested_objs(obj).each do |name, nested_obj|
94
99
  element_name = singular_name name.to_s
95
100
  if nested_obj.kind_of? Hash
96
- write_obj_to_xml nested_obj, element_name, xml
101
+ write_obj_to_xml nested_obj, element_name, xml, false
97
102
  elsif nested_obj.kind_of? Array
98
103
  nested_obj.each do |item|
99
- write_obj_to_xml item, element_name, xml
104
+ write_obj_to_xml item, element_name, xml, false
100
105
  end
101
106
  end
102
107
  end
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 0
8
- - 4
9
- version: 0.0.4
8
+ - 6
9
+ version: 0.0.6
10
10
  platform: ruby
11
11
  authors:
12
12
  - Karl Jonathan Ward
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-10-24 00:00:00 +01:00
17
+ date: 2011-11-02 00:00:00 +00:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency