pdf-extract 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/assign.rb +72 -0
- data/bin/config.json +4 -0
- data/bin/fac_v19n11_s5.mask.pdf +0 -0
- data/bin/margins.mask.pdf +0 -0
- data/bin/one-column.mask.pdf +24110 -39
- data/bin/pdf-extract +146 -0
- data/bin/s002040050107_Arch_Toxicol_1994_68_8.mask.pdf +0 -0
- data/bin/some3.mask.pdf +0 -0
- data/bin/some5.mask.pdf +0 -0
- data/bin/some6.mask.pdf +0 -0
- data/bin/train.rb +48 -0
- data/bin/two-column.mask.pdf +0 -0
- data/data/familynames.db +0 -0
- data/data/stopwords.txt +1 -0
- data/lib/analysis/columns.rb +75 -0
- data/lib/analysis/margins.rb +84 -0
- data/lib/analysis/sections.rb +156 -0
- data/lib/analysis/titles.rb +53 -0
- data/lib/analysis/zones.rb +128 -0
- data/lib/font_metrics.rb +240 -0
- data/lib/kmeans.rb +114 -0
- data/lib/language.rb +58 -0
- data/lib/model/characters.rb +320 -0
- data/lib/model/chunks.rb +103 -0
- data/lib/model/regions.rb +112 -0
- data/lib/multi_range.rb +69 -0
- data/lib/names.rb +85 -0
- data/lib/pdf-extract.rb +77 -0
- data/lib/pdf.rb +255 -0
- data/lib/references/references.rb +184 -0
- data/lib/references/resolve.rb +113 -0
- data/lib/references/resolved_references.rb +37 -0
- data/lib/spatial.rb +188 -0
- data/lib/view/abstract_view.rb +32 -0
- data/lib/view/pdf_view.rb +43 -0
- data/lib/view/png_view.rb +30 -0
- data/lib/view/xml_view.rb +113 -0
- metadata +208 -0
data/bin/pdf-extract
ADDED
@@ -0,0 +1,146 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'commander/import'
|
4
|
+
require 'json'
|
5
|
+
require_relative '../lib/pdf-extract'
|
6
|
+
require_relative '../lib/references/resolve'
|
7
|
+
|
8
|
+
# Command-line front end for the pdf-extract toolkit, written with the
# commander DSL (program / global_option / command).

program :name, 'pdf-extract'
program :version, '0.0.1'
program :description, 'PDF content extraction toolkit'

# Object types that can be requested. Every entry of `objects` becomes a
# `--<name>` flag; the three groups below also get a flag selecting them all.
semantic = ['resolved_references', 'references', 'titles', 'sections']
margins = ['top_margins', 'bottom_margins', 'left_margins', 'right_margins']
zones = ['headers', 'footers', 'bodies']
objects = ['characters', 'chunks', 'regions', 'columns'] + semantic + margins + zones

# Names accepted by --resolvers, mapped to the resolver classes they select.
resolvers = {
  "sigg" => PdfExtract::Resolve::Sigg,
  "freecite" => PdfExtract::Resolve::FreeCite,
  "stq" => PdfExtract::Resolve::SimpleTextQuery
}

# Default output target for each view, derived from the input filename:
# XML goes to stdout, PDF/PNG go to "<input basename without extension>.mask.*"
# in the current directory.
outputs = {
  :xml => proc { :stdout },
  :pdf => proc { |f| File.basename(f.sub(/\.[a-zA-Z0-9]+\Z/, "")) + ".mask.pdf" },
  :png => proc { |f| File.basename(f.sub(/\.[a-zA-Z0-9]+\Z/, "")) + ".mask.png" }
}

# Sub-commands that render a view of the extracted objects.
commands = [
  {
    :name => "extract",
    :view => :xml,
    :description => "Extract objects as XML."
  },
  {
    :name => "mark",
    :view => :pdf,
    :description => "Highlight bounding boxes of objects in a PDF."
  },
  {
    :name => "annotate",
    :view => :not_implemented,
    :description => "Annotate a PDF with attributes of extracted objects."
  }
]

# State collected from global options. These are globals because
# apply_settings (a method, not a closure) has to read them.
$chosen_objects = []
$render_options = {}
$overrides = {}

objects.each do |o|
  global_option "--#{o}" do |_|
    $chosen_objects << o
  end
end

global_option "--semantic" do |_| $chosen_objects += semantic end
global_option "--margins" do |_| $chosen_objects += margins end
global_option "--zones" do |_| $chosen_objects += zones end

global_option "--resolvers RESOLVERS" do |chosen_resolvers|
  names = chosen_resolvers.split ","
  names.each do |name|
    # Report the offending name (the original interpolated an undefined
    # local here, which raised NameError instead of this message).
    fail "No such resolver #{name}" unless resolvers.key? name
  end
  PdfExtract::Resolve.resolvers = names.map { |name| resolvers[name] }
end

global_option "--output FILE" do |filename|
  $output = filename
end

global_option "--no-lines" do |_|
  $render_options[:lines] = false
end

global_option "--precision DIGITS" do |digits|
  $render_options[:round] = digits.to_i
end

global_option "--outline" do |_|
  $render_options[:outline] = true
end

global_option "--set SETTING:VALUE" do |s|
  # Split on the first colon only so a value may itself contain colons.
  name, value = s.split ":", 2
  $overrides[name] = value
end

global_option "--config CONFIG_FILE" do |filename|
  $config = filename
end

# Applies settings to +pdf+: first every pair from the JSON file given with
# --config (if any), then every --set override, so the command line wins.
# The third argument to pdf.set names where the value came from.
def self.apply_settings pdf
  unless $config.nil?
    # File.read closes the file once its content has been read.
    conf = JSON.parse File.read($config)
    conf.each_pair do |setting, value|
      pdf.set setting.to_sym, value, $config
    end
  end
  $overrides.each_pair { |k, v| pdf.set k.to_sym, v, "command line" }
end

commands.each do |cmd|
  command cmd[:name].to_sym do |c|
    c.syntax = "pdf-extract #{cmd[:name]} [options] filename"
    c.description = cmd[:description]

    c.action do |args, _options|
      default_output = outputs[cmd[:view]]
      # A view without an entry in `outputs` (currently :not_implemented)
      # cannot be rendered; say so instead of failing on nil.
      fail "The #{cmd[:name]} command is not implemented" if default_output.nil?

      args.each do |filename|
        # Work out the target per input file. Caching the default in $output
        # made every later file overwrite the first file's output.
        target = $output.nil? ? default_output.call(filename) : $output

        opts = {:as => cmd[:view]}.merge $render_options
        out = PdfExtract.view filename, opts do |pdf|
          apply_settings pdf
          $chosen_objects.each { |name| pdf.send name.to_sym }
        end

        if target == :stdout
          say out
        else
          PdfExtract.view_class(cmd[:view]).write(out, target)
        end
      end
    end

  end
end

command :settings do |c|
  c.syntax = "pdf-extract settings [options]"
  c.description = "Print settings that pdf-extract will use to screen."

  c.action do |_args, _options|
    pdf = PdfExtract::Pdf.new
    apply_settings pdf
    s = pdf.settings
    say "<%= color('AT DEFAULT:', BOLD) %>\n\n"
    s.unmodified.each_pair { |k, v| say "#{k}:\t#{v}" }
    say "\n<%= color('OVERRIDDEN:', BOLD) %>\n\n"
    s.modified.each_pair { |k, v| say "#{k}:\t#{v} (#{s.agent(k)})" }
  end
end
|
146
|
+
|
File without changes
|
data/bin/some3.mask.pdf
ADDED
Binary file
|
data/bin/some5.mask.pdf
ADDED
Binary file
|
data/bin/some6.mask.pdf
ADDED
Binary file
|
data/bin/train.rb
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
# Train ideal attributes based on example input.
|
2
|
+
|
3
|
+
require_relative "../lib/language"
|
4
|
+
|
5
|
+
# Metrics to train, each mapped to the Language function that computes it
# for one line of text. Method objects are taken with Module#method so the
# functions are referenced here rather than invoked.
variables = {
  :name_ratio => PdfExtract::Language.method(:name_ratio),
  :letter_ratio => PdfExtract::Language.method(:letter_ratio),
  :year_ratio => PdfExtract::Language.method(:year_ratio)
}

abort "usage: train.rb EXAMPLES_FILE" if ARGV[0].nil?

# Every observed value per metric; needed again for the deviation pass.
results = {}
variables.each_key { |name| results[name] = [] }

count = 0

# File.foreach yields each line and closes the file when done.
File.foreach(ARGV[0]) do |line|
  variables.each_pair do |var, fn|
    results[var] << fn.call(line)
  end

  count += 1
end

abort "No example lines found in #{ARGV[0]}" if count.zero?

# Arithmetic mean of each metric. The 0.0 seed forces float division.
avgs = {}
results.each_pair do |name, vals|
  avgs[name] = vals.inject(0.0) { |sum, val| sum + val } / count
end

# Sample standard deviation of each metric (n - 1 denominator).
std_deviations = {}
results.each_pair do |name, vals|
  squared_error = vals.inject(0.0) { |sum, val| sum + (avgs[name] - val) ** 2 }
  # With a single sample the n - 1 denominator is zero; report no spread.
  std_deviations[name] = count > 1 ? Math.sqrt(squared_error / (count - 1)) : 0.0
end

puts "Averages"
puts avgs
puts "Standard deviations"
puts std_deviations
|
Binary file
|
data/data/familynames.db
ADDED
Binary file
|
data/data/stopwords.txt
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your,association,company,org,organisation,president,vice,nobel,prize,medicine,biology,physics,chemistry,laboratories,labs
|
@@ -0,0 +1,75 @@
|
|
1
|
+
module PdfExtract
  # Registers the :columns spatial, which derives column boxes for a page
  # body by sampling which horizontal spans are covered by regions at several
  # heights within that body.
  module Columns

    Settings.default :column_sample_count, 8
    Settings.default :max_column_count, 3

    # Returns a MultiRange holding the horizontal extent (x .. x + width) of
    # every region in +body_regions+ whose vertical extent (y .. y + height,
    # both ends inclusive) covers the height +y+.
    def self.columns_at(y, body_regions)
      crossing = body_regions.select do |region|
        region[:y] <= y && (region[:y] + region[:height]) >= y
      end

      crossing.each_with_object(MultiRange.new) do |region, x_mask|
        x_mask.append(region[:x]..(region[:x] + region[:width]))
      end
    end

    # Declares the paged :columns spatial on +pdf+; it depends on :regions
    # and :bodies.
    def self.include_in(pdf)
      pdf.spatials :columns, :paged => true, :depends_on => [:regions, :bodies] do |parser|

        body = nil
        body_regions = []

        # Only the region list is cleared here; `body` keeps its last value.
        # NOTE(review): if a page can arrive without a :bodies object, `body`
        # is nil or left over from an earlier page — confirm against the
        # parser's contract.
        parser.before { body_regions = [] }

        parser.objects(:bodies) { |b| body = b }

        # Keep only the regions that lie inside the body.
        parser.objects :regions do |region|
          body_regions << region if Spatial.contains?(body, region)
        end

        parser.after do
          sample_count = pdf.settings[:column_sample_count]

          # Sample heights are evenly spaced strictly inside the body.
          step = 1.0 / (sample_count + 1)
          samples = (1..sample_count).map do |i|
            columns_at(body[:y] + (body[:height] * i * step), body_regions)
          end

          # Discard those with more than x columns. They've probably hit a table.
          samples.reject! { |mask| mask.count > pdf.settings[:max_column_count] }

          if samples.empty?
            []
          else
            # Keep only the samples with the highest column count ...
            most = samples.map { |mask| mask.count }.max
            samples.select! { |mask| mask.count == most }

            # ... and of those, the ones with the largest avg.
            widest = samples.map { |mask| mask.avg }.max
            samples.reject! { |mask| mask.avg < widest }

            # Each range of the winning sample becomes a copy of the body
            # narrowed to that range's horizontal span.
            samples.first.ranges.map do |range|
              body.merge(:x => range.min, :width => range.max - range.min)
            end
          end
        end

      end
    end

  end
end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
require_relative '../multi_range'
|
2
|
+
|
3
|
+
module PdfExtract
  # Registers the four per-page margin spatials (:top_margins,
  # :bottom_margins, :left_margins, :right_margins), each computed from the
  # extent of the page's regions along one axis.
  module Margins

    # Declares a paged spatial called +name+ that depends on :regions.
    #
    # For each page, the extent of every region along +axis+ (:x measures
    # :width, :y measures :height) is appended to a MultiRange. When the page
    # ends, the block given to this method is called with that MultiRange and
    # a hash holding :page, :page_width and :page_height taken from the first
    # region seen; the block's result is the result of the page. A page whose
    # MultiRange has a zero count produces nil and the block is not called.
    def self.axis_spatials(pdf, name, axis)
      pdf.spatials name, :paged => true, :depends_on => [:regions] do |parser|
        axis_mask = MultiRange.new

        # -1 marks "no region seen yet on this page".
        page = -1
        page_width = 0
        page_height = 0

        # nil for any axis other than :x or :y.
        dimension = { :x => :width, :y => :height }[axis]

        parser.before do
          axis_mask = MultiRange.new
          page = -1
        end

        parser.objects :regions do |region|
          if page == -1
            page, page_width, page_height =
              region[:page], region[:page_width], region[:page_height]
          end

          axis_mask.append(region[axis]..(region[axis] + region[dimension]))
        end

        parser.after do
          unless axis_mask.count.zero?
            yield axis_mask, {
              :page => page,
              :page_width => page_width,
              :page_height => page_height
            }
          end
        end
      end
    end

    # Registers all four margin spatials on +pdf+. Every margin is a box
    # spanning the full page along one axis and running from a page edge to
    # the nearest edge of the region extent along the other axis.
    def self.include_in(pdf)
      # From the largest covered y up to the page height.
      axis_spatials(pdf, :top_margins, :y) do |y_mask, obj|
        upper = y_mask.max
        obj.merge(
          :x => 0,
          :y => upper,
          :width => obj[:page_width],
          :height => obj[:page_height] - upper
        )
      end

      # From y = 0 up to the smallest covered y.
      axis_spatials(pdf, :bottom_margins, :y) do |y_mask, obj|
        obj.merge(
          :x => 0,
          :y => 0,
          :width => obj[:page_width],
          :height => y_mask.min
        )
      end

      # From x = 0 across to the smallest covered x.
      axis_spatials(pdf, :left_margins, :x) do |x_mask, obj|
        obj.merge(
          :x => 0,
          :y => 0,
          :width => x_mask.min,
          :height => obj[:page_height]
        )
      end

      # From the largest covered x across to the page width.
      axis_spatials(pdf, :right_margins, :x) do |x_mask, obj|
        right = x_mask.max
        obj.merge(
          :x => right,
          :y => 0,
          :width => obj[:page_width] - right,
          :height => obj[:page_height]
        )
      end
    end

  end
end
|
@@ -0,0 +1,156 @@
|
|
1
|
+
require_relative '../language'
|
2
|
+
require_relative '../spatial'
|
3
|
+
require_relative '../kmeans'
|
4
|
+
|
5
|
+
module PdfExtract
  # Registers the :sections spatial: runs of column regions that share
  # formatting are merged into sections, which then receive text statistics
  # and category scores.
  module Sections

    Settings.default :width_ratio, 0.9

    # True when +a+ and +b+ have the same font and the same line height once
    # both line heights are rounded to two decimal places.
    def self.match?(a, b)
      same_line_height = a[:line_height].round(2) == b[:line_height].round(2)
      same_font = a[:font] == b[:font]
      same_line_height && same_font
    end

    # True when +region+ is no wider than +column+ and its width is at least
    # the :width_ratio setting times the column's width.
    # NOTE(review): an earlier comment described an exemption for
    # single-line regions; no such exemption is implemented here.
    def self.candidate?(pdf, region, column)
      width_ratio = pdf.settings[:width_ratio]
      return false unless region[:width] <= column[:width]

      (region[:width].to_f / column[:width]) >= width_ratio
    end

    # Returns the cluster whose centre :name_ratio is nearest to 0.1, or nil
    # when no cluster is closer than 1. The earliest cluster wins a tie.
    def self.reference_cluster(clusters)
      ideal = 0.1
      best = nil
      best_distance = 1

      clusters.each do |cluster|
        distance = (cluster[:centre][:name_ratio] - ideal).abs
        next unless distance < best_distance

        best = cluster
        best_distance = distance
      end

      best
    end

    # Flattens +clusters+ into a single list of their items, writing onto
    # each item a :centre string made of the cluster centre's values rounded
    # to three decimals and joined with ", ".
    def self.clusters_to_spatials(clusters)
      per_cluster = clusters.map do |cluster|
        cluster[:items].each do |item|
          rounded = cluster[:centre].values.map { |v| v.round(3) }
          item[:centre] = rounded.join ", "
        end
        cluster[:items]
      end

      per_cluster.flatten
    end

    # Maps each section to the result of Spatial.drop_spatial on it, merged
    # with the Language statistics computed from its text content.
    def self.add_content_stats(sections)
      sections.map do |section|
        content = Spatial.get_text_content section
        stripped = Spatial.drop_spatial(section)

        stats = {
          :letter_ratio => Language.letter_ratio(content),
          :year_ratio => Language.year_ratio(content),
          :cap_ratio => Language.cap_ratio(content),
          :name_ratio => Language.name_ratio(content),
          :word_count => Language.word_count(content)
        }

        stripped.merge(stats)
      end
    end

    # Declares the :sections spatial on +pdf+; it depends on :regions and
    # :columns.
    def self.include_in(pdf)
      pdf.spatials :sections, :depends_on => [:regions, :columns] do |parser|

        # One entry per column, each carrying the regions assigned to it.
        columns = []

        parser.objects :columns do |column|
          columns << {:column => column, :regions => []}
        end

        # A region goes to the first column on its page that contains it;
        # regions that no column contains are ignored.
        parser.objects :regions do |region|
          container = columns.find do |c|
            column = c[:column]
            column[:page] == region[:page] && Spatial.contains?(column, region)
          end

          container[:regions] << region if container
        end

        parser.after do
          # Order each column's regions by descending y.
          columns.each { |c| c[:regions].sort_by! { |r| -r[:y] } }

          # Group the columns by page, then order each page's columns by
          # ascending x.
          pages = columns.group_by { |c| c[:column][:page] }
          pages.each_value { |page_columns| page_columns.sort_by! { |c| c[:column][:x] } }

          sections = []
          found = []

          # Walk the regions page by page, column by column. Consecutive
          # candidates with matching formatting are merged into the previous
          # entry of `found`; a non-candidate flushes `found` into `sections`.
          pages.each_value do |page_columns|
            page_columns.each do |c|
              column = c[:column]

              c[:regions].each do |region|
                unless candidate?(pdf, region, column)
                  sections.concat(found)
                  found = []
                  next
                end

                previous = found.last
                if previous && match?(previous, region)
                  previous.merge!(Spatial.merge_lines(previous, region, {}))
                else
                  found << region
                end
              end
            end
          end

          sections.concat(found)

          # We now have sections. Add information to them.
          sections = add_content_stats sections

          # Score sections into categories based on their textual attributes.
          # NOTE(review): each pair is presumably [ideal value, weight] —
          # confirm against Spatial.score.
          ideals = {
            :reference => {
              :name_ratio => [0.2, 5],
              :letter_ratio => [0.25, 2],
              :year_ratio => [0.05, 7]
            },
            :body => {
              :name_ratio => [0.03, 1],
              :letter_ratio => [0.1, 1],
              :year_ratio => [0.0, 1]
            }
          }

          Spatial.score(sections, ideals)

          sections
        end

      end
    end

  end
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
require_relative "../spatial"
|
2
|
+
|
3
|
+
module PdfExtract
  # Registers the :titles spatial, which picks a single region of the
  # document as its title.
  module Titles

    Settings.default :title_slop, 0.2

    # Narrows +regions+ down to the one title candidate, or returns nil when
    # no region qualifies. +regions+ itself is not modified.
    #
    # A title should:
    #  * be longer than one letter,
    #  * have a y at or above half the page height,
    #  * be no less tall than the tallest remaining line height reduced by
    #    the :title_slop fraction,
    #  * be on the earliest page that still has a candidate,
    #  * have the largest y of what is left.
    def self.best_candidate(pdf, regions)
      candidates = regions.reject do |r|
        Spatial.get_text_content(r).strip.length < 2 ||
          r[:y] < (r[:page_height] / 2.0)
      end
      # Nothing survived the basic filters. Without this guard the lookups
      # below dereferenced nil.
      return nil if candidates.empty?

      tallest_line = candidates.map { |r| r[:line_height] }.max
      min_line_height = tallest_line - (tallest_line * pdf.settings[:title_slop])
      candidates = candidates.reject { |r| r[:line_height] < min_line_height }
      # Only possible when :title_slop is negative, but stay safe.
      return nil if candidates.empty?

      first_page = candidates.map { |r| r[:page] }.min
      candidates = candidates.select { |r| r[:page] == first_page }

      candidates.max_by { |r| r[:y] }
    end

    # Declares the :titles spatial on +pdf+; it depends on :regions. The
    # result is a hash with :content, :line_height and :font for the chosen
    # region, or an empty array when no region qualifies.
    def self.include_in pdf
      pdf.spatials :titles, :depends_on => [:regions] do |parser|
        titles = []

        parser.objects :regions do |region|
          titles << region
        end

        parser.after do
          title = best_candidate(pdf, titles)

          if title.nil?
            []
          else
            {
              :content => Spatial.get_text_content(title),
              :line_height => title[:line_height],
              :font => title[:font]
            }
          end
        end
      end
    end

  end
end
|
53
|
+
|