pdf-extract 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/pdf-extract ADDED
@@ -0,0 +1,146 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'commander/import'
4
+ require 'json'
5
+ require_relative '../lib/pdf-extract'
6
+ require_relative '../lib/references/resolve'
7
+
8
+ program :name, 'pdf-extract'
9
+ program :version, '0.0.1'
10
+ program :description, 'PDF content extraction toolkit'
11
+
12
# Names of every object kind that can be requested on the command line.
# The three named groups can also be selected wholesale with a single flag.
semantic = %w[resolved_references references titles sections]
margins  = %w[top bottom left right].map { |side| "#{side}_margins" }
zones    = %w[headers footers bodies]
objects  = %w[characters chunks regions columns] + semantic + margins + zones
16
+
17
# Reference resolvers that can be picked with --resolvers, keyed by the
# name used on the command line.
resolvers = Hash[[
  ["sigg",     PdfExtract::Resolve::Sigg],
  ["freecite", PdfExtract::Resolve::FreeCite],
  ["stq",      PdfExtract::Resolve::SimpleTextQuery]
]]
22
+
23
# Default output destination for each view type, derived from the input
# filename. XML goes to standard output; the mask views are written to the
# current directory, named after the input file with its extension removed.
strip_extension = lambda do |path|
  File.basename(path.sub(/\.[a-zA-Z0-9]+\Z/, ""))
end

outputs = {
  :xml => proc { :stdout },
  :pdf => proc { |f| strip_extension.call(f) + ".mask.pdf" },
  :png => proc { |f| strip_extension.call(f) + ".mask.png" }
}
28
+
29
# Sub-commands offered by the tool: name, the view used to render the
# result, and the help text shown for the command.
commands = [
  ["extract",  :xml,             "Extract objects as XML."],
  ["mark",     :pdf,             "Highlight bounding boxes of objects in a PDF."],
  ["annotate", :not_implemented, "Annotate a PDF with attributes of extracted objects."]
].map do |name, view, description|
  { :name => name, :view => view, :description => description }
end
46
+
47
# State shared between the global option handlers and the commands.
$chosen_objects = []   # object names requested on the command line
$render_options = {}   # extra options merged into the view options
$overrides = {}        # settings given with --set

# One flag per object kind, e.g. --regions.
objects.each do |object_name|
  global_option("--#{object_name}") { |_| $chosen_objects << object_name }
end

# Group flags select every object kind in the group at once.
{ "--semantic" => semantic, "--margins" => margins, "--zones" => zones }.each_pair do |flag, group|
  global_option(flag) { |_| $chosen_objects += group }
end
62
+
63
# --resolvers takes a comma separated list of resolver names (the keys of
# +resolvers+) and installs the matching classes, in the order given, as
# the active resolvers. Fails with a readable message on an unknown name.
global_option "--resolvers RESOLVERS" do |chosen_resolvers|
  names = chosen_resolvers.split ","

  # Validate every name before changing anything. The message must use the
  # offending name itself; referring to an undefined local here would turn
  # the intended error into a NameError.
  names.each do |name|
    fail "No such resolver #{name}" unless resolvers.key? name
  end

  PdfExtract::Resolve.resolvers = names.map { |name| resolvers[name] }
end
71
+
72
# Explicit output destination; when absent each command falls back to the
# default for its view.
global_option("--output FILE") { |filename| $output = filename }

# The remaining flags only record a render option that is later merged
# into the options handed to the view.
global_option("--no-lines") { |_| $render_options[:lines] = false }

global_option("--precision DIGITS") { |digits| $render_options[:round] = digits.to_i }

global_option("--outline") { |_| $render_options[:outline] = true }
87
+
88
# --set SETTING:VALUE records a single setting override. Only the first
# colon separates the name from the value, so values that themselves
# contain colons are kept intact.
global_option "--set SETTING:VALUE" do |s|
  name, value = s.split ":", 2
  $overrides[name] = value
end
92
+
93
# --config remembers the path of a JSON settings file; it is read later,
# when the settings are applied to a PDF.
global_option("--config CONFIG_FILE") { |path| $config = path }
96
+
97
# Applies user supplied settings to +pdf+.
#
# Settings from the JSON file named by $config (if any) are applied first,
# each tagged with the config file path as its source. Overrides collected
# from --set are applied afterwards, tagged "command line", so they are set
# last.
#
# pdf - any object responding to set(name, value, source).
#
# Returns the $overrides hash (the result of the final each_pair).
def self.apply_settings pdf
  unless $config.nil?
    # File.read closes the file itself; an explicit File.open without a
    # block would leave the handle open until garbage collection.
    JSON.parse(File.read($config)).each_pair do |setting, value|
      pdf.set setting.to_sym, value, $config
    end
  end

  $overrides.each_pair { |k, v| pdf.set k.to_sym, v, "command line" }
end
106
+
107
# Registers one sub-command per entry of +commands+. Each command renders
# every file given on the command line with the command's view and either
# prints the result or writes it to a file.
commands.each do |cmd|
  command cmd[:name].to_sym do |c|
    c.syntax = "pdf-extract #{cmd[:name]} [options] filename"
    c.description = cmd[:description]

    c.action do |args, options|
      view = cmd[:view]
      default_destination = outputs[view]

      args.each do |filename|
        # Work out the destination per file. Storing the first file's
        # default in $output would make every later file overwrite it.
        destination = $output
        if destination.nil?
          # Views without an entry in +outputs+ have no default destination.
          fail "The #{cmd[:name]} command is not implemented" if default_destination.nil?
          destination = default_destination.call(filename)
        end

        opts = {:as => view}.merge $render_options
        out = PdfExtract.view filename, opts do |pdf|
          apply_settings pdf
          # Requesting an object by name registers it for extraction.
          $chosen_objects.each { |name| pdf.send name.to_sym }
        end

        if destination == :stdout
          say out
        else
          PdfExtract.view_class(view).write(out, destination)
        end
      end
    end

  end
end
131
+
132
# The settings command prints the effective settings, split into those
# still at their default and those that were overridden (with the source
# of each override).
command :settings do |c|
  c.syntax = "pdf-extract settings [options]"
  c.description = "Print settings that pdf-extract will use to screen."

  c.action do |args, options|
    pdf = PdfExtract::Pdf.new
    apply_settings pdf
    settings = pdf.settings

    say "<%= color('AT DEFAULT:', BOLD) %>\n\n"
    settings.unmodified.each_pair do |key, value|
      say "#{key}:\t#{value}"
    end

    say "\n<%= color('OVERRIDDEN:', BOLD) %>\n\n"
    settings.modified.each_pair do |key, value|
      say "#{key}:\t#{value} (#{settings.agent(key)})"
    end
  end
end
146
+
File without changes
Binary file
Binary file
Binary file
data/bin/train.rb ADDED
@@ -0,0 +1,48 @@
1
+ # Train ideal attributes based on example input.
2
+
3
+ require_relative "../lib/language"
4
+
5
# Arithmetic mean of +values+ as a Float; 0.0 for an empty list.
def mean values
  return 0.0 if values.empty?
  values.inject(0.0) { |total, value| total + value } / values.size
end

# Sample standard deviation of +values+ (n - 1 denominator).
# Returns 0.0 when there are fewer than two values, where the sample
# deviation is undefined and the division would otherwise be by zero.
def sample_std_deviation values
  return 0.0 if values.size < 2
  centre = mean values
  squared = values.inject(0.0) { |total, value| total + (centre - value) ** 2 }
  Math.sqrt(squared / (values.size - 1))
end

# Evaluates every measure on every line and summarises the results.
#
# lines     - enumerable of strings, one training example each.
# variables - Hash of name => callable taking a line and returning a number.
#
# Returns [averages, std_deviations], two Hashes keyed like +variables+.
def train_attributes lines, variables
  results = {}
  variables.each_key { |name| results[name] = [] }

  lines.each do |line|
    variables.each_pair { |name, fn| results[name] << fn.call(line) }
  end

  avgs = {}
  std_deviations = {}
  results.each_pair do |name, vals|
    avgs[name] = mean vals
    std_deviations[name] = sample_std_deviation vals
  end

  [avgs, std_deviations]
end

if ARGV.empty?
  warn "usage: train.rb EXAMPLES_FILE"
else
  # Method objects must be looked up by name; writing Language::name_ratio
  # would call the method (with no arguments) instead of referencing it.
  variables = {
    :name_ratio => PdfExtract::Language.method(:name_ratio),
    :letter_ratio => PdfExtract::Language.method(:letter_ratio),
    :year_ratio => PdfExtract::Language.method(:year_ratio)
  }

  avgs, std_deviations = train_attributes File.readlines(ARGV[0]), variables

  puts "Averages"
  puts avgs
  puts "Standard deviations"
  puts std_deviations
end
Binary file
Binary file
@@ -0,0 +1 @@
1
+ a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your,association,company,org,organisation,president,vice,nobel,prize,medicine,biology,physics,chemistry,laboratories,labs
@@ -0,0 +1,75 @@
1
module PdfExtract
  # Derives text columns from the regions that lie inside a page body.
  module Columns

    # Number of horizontal sample lines taken through the body.
    Settings.default :column_sample_count, 8
    # Samples crossing more ranges than this are discarded.
    Settings.default :max_column_count, 3

    # Collects the x extents of all regions that the horizontal line at
    # height +y+ passes through.
    #
    # Returns a MultiRange holding one x range per region crossed.
    def self.columns_at y, body_regions
      x_mask = MultiRange.new

      body_regions.each do |region|
        crosses = region[:y] <= y && (region[:y] + region[:height]) >= y
        x_mask.append(region[:x] .. (region[:x] + region[:width])) if crosses
      end

      x_mask
    end

    # Registers the paged :columns spatial on +pdf+. The after hook yields
    # one object per detected column (the body narrowed to the column's x
    # range), or [] when nothing can be determined.
    def self.include_in pdf
      deps = [:regions, :bodies]
      pdf.spatials :columns, :paged => true, :depends_on => deps do |parser|

        # NOTE(review): body is not reset in the before hook, so a body seen
        # earlier stays in effect until another one arrives - confirm that
        # this carry-over is intended.
        body = nil
        body_regions = []

        parser.before do
          body_regions = []
        end

        parser.objects :bodies do |b|
          body = b
        end

        parser.objects :regions do |region|
          # Without a body there is nothing for a region to be contained in.
          body_regions << region if body && Spatial.contains?(body, region)
        end

        parser.after do
          # No body seen: indexing into nil below would raise.
          next [] if body.nil?

          column_sample_count = pdf.settings[:column_sample_count]
          max_column_count = pdf.settings[:max_column_count]

          # Sample at evenly spaced heights strictly inside the body.
          step = 1.0 / (column_sample_count + 1)
          column_ranges = (1 .. column_sample_count).map do |i|
            columns_at(body[:y] + (body[:height] * i * step), body_regions)
          end

          # Discard those with more than x columns. They've probably hit a table.
          column_ranges.reject! { |r| r.count > max_column_count }
          next [] if column_ranges.empty?

          # Find the highest column count.
          most = column_ranges.map { |r| r.count }.max
          column_ranges.reject! { |r| r.count != most }

          # Take the columns that are widest.
          widest = column_ranges.map { |r| r.avg }.max
          column_ranges.reject! { |r| r.avg < widest }

          column_ranges.first.ranges.map do |range|
            body.merge({:x => range.min, :width => range.max - range.min })
          end
        end

      end
    end

  end
end
@@ -0,0 +1,84 @@
1
+ require_relative '../multi_range'
2
+
3
module PdfExtract
  # Derives the four page margins from the extent of a page's regions.
  module Margins

    # Registers a paged spatial called +name+ that accumulates the extent of
    # every region along +axis+ (:x or :y). Once a page's regions have been
    # seen, the given block is called with the accumulated MultiRange and a
    # Hash of :page, :page_width and :page_height taken from the first
    # region; its result becomes the result of the after hook. When no
    # region was seen the after hook returns nil.
    def self.axis_spatials pdf, name, axis
      pdf.spatials name, :paged => true, :depends_on => [:regions] do |parser|
        covered = MultiRange.new
        current_page = -1          # -1 means "no region seen yet"
        width = 0
        height = 0

        # Region key holding the size along the chosen axis.
        extent = { :x => :width, :y => :height }[axis]

        parser.before do
          covered = MultiRange.new
          current_page = -1
        end

        parser.objects :regions do |region|
          if current_page == -1
            current_page = region[:page]
            width = region[:page_width]
            height = region[:page_height]
          end

          start = region[axis]
          covered.append(start..(start + region[extent]))
        end

        parser.after do
          unless covered.count.zero?
            yield covered, {
              :page => current_page,
              :page_width => width,
              :page_height => height
            }
          end
        end
      end
    end

    # Registers :top_margins, :bottom_margins, :left_margins and
    # :right_margins on +pdf+. Each margin is the strip between the page
    # edge and the nearest region extent on that side.
    def self.include_in pdf
      axis_spatials pdf, :top_margins, :y do |mask, page|
        upper = mask.max
        page.merge(:x => 0, :y => upper,
                   :width => page[:page_width],
                   :height => page[:page_height] - upper)
      end

      axis_spatials pdf, :bottom_margins, :y do |mask, page|
        page.merge(:x => 0, :y => 0,
                   :width => page[:page_width],
                   :height => mask.min)
      end

      axis_spatials pdf, :left_margins, :x do |mask, page|
        page.merge(:x => 0, :y => 0,
                   :width => mask.min,
                   :height => page[:page_height])
      end

      axis_spatials pdf, :right_margins, :x do |mask, page|
        right = mask.max
        page.merge(:x => right, :y => 0,
                   :width => page[:page_width] - right,
                   :height => page[:page_height])
      end
    end

  end
end
@@ -0,0 +1,156 @@
1
+ require_relative '../language'
2
+ require_relative '../spatial'
3
+ require_relative '../kmeans'
4
+
5
module PdfExtract
  # Groups column regions into sections and annotates each section with
  # statistics about its text.
  module Sections

    # Minimum width of a region, as a fraction of its column's width, for
    # the region to be treated as part of a section.
    Settings.default :width_ratio, 0.9

    # True when regions +a+ and +b+ share the same line height (compared
    # after rounding to two decimals) and the same font.
    def self.match? a, b
      return false unless a[:line_height].round(2) == b[:line_height].round(2)
      a[:font] == b[:font]
    end

    # True when +region+ is no wider than +column+ and fills at least the
    # configured width_ratio of it.
    def self.candidate? pdf, region, column
      return false unless region[:width] <= column[:width]
      (region[:width].to_f / column[:width]) >= pdf.settings[:width_ratio]
    end

    # Returns the cluster whose centre name_ratio is closest to 0.1, or nil
    # when no cluster is within a distance of 1. The first of several
    # equally close clusters wins.
    def self.reference_cluster clusters
      ideal = 0.1

      chosen, _ = clusters.inject([nil, 1]) do |(best, smallest), cluster|
        diff = (cluster[:centre][:name_ratio] - ideal).abs
        diff < smallest ? [cluster, diff] : [best, smallest]
      end

      chosen
    end

    # Tags every item with a printable form of its cluster's centre (values
    # rounded to three decimals, comma separated) and returns all items as
    # one flat list.
    def self.clusters_to_spatials clusters
      spatials = []

      clusters.each do |cluster|
        items = cluster[:items]
        items.each do |item|
          item[:centre] = cluster[:centre].values.map { |v| v.round(3) }.join(", ")
        end
        spatials << items
      end

      spatials.flatten
    end

    # Returns a copy of each section, passed through Spatial.drop_spatial and
    # extended with ratios and a word count computed from its text content.
    def self.add_content_stats sections
      sections.map do |section|
        text = Spatial.get_text_content section
        stripped = Spatial.drop_spatial(section)
        stripped.merge({
          :letter_ratio => Language.letter_ratio(text),
          :year_ratio => Language.year_ratio(text),
          :cap_ratio => Language.cap_ratio(text),
          :name_ratio => Language.name_ratio(text),
          :word_count => Language.word_count(text)
        })
      end
    end

    # Registers the :sections spatial on +pdf+. Regions are assigned to the
    # first column that contains them; runs of consecutive candidate regions
    # form sections, and adjacent regions with matching line height and font
    # are merged into one.
    def self.include_in pdf
      pdf.spatials :sections, :depends_on => [:regions, :columns] do |parser|

        columns = []

        parser.objects :columns do |column|
          columns << {:column => column, :regions => []}
        end

        parser.objects :regions do |region|
          holders = columns.select do |entry|
            col = entry[:column]
            col[:page] == region[:page] && Spatial.contains?(col, region)
          end

          home = holders.first
          home[:regions] << region if home
        end

        parser.after do
          # Sort regions in each column from highest to lowest.
          columns.each { |entry| entry[:regions].sort_by! { |r| -r[:y] } }

          # Group columns into pages.
          pages = {}
          columns.each do |entry|
            (pages[entry[:column][:page]] ||= []) << entry
          end

          # Sort columns on each page from left to right.
          pages.each_value do |page_columns|
            page_columns.sort_by! { |entry| entry[:column][:x] }
          end

          sections = []
          run = []   # candidate regions collected since the last non-candidate

          pages.each_value do |page_columns|
            page_columns.each do |entry|
              column = entry[:column]

              entry[:regions].each do |region|
                unless candidate? pdf, region, column
                  # A non-candidate region ends the current run.
                  sections = sections + run
                  run = []
                  next
                end

                previous = run.last
                if !previous.nil? && match?(previous, region)
                  previous.merge!(Spatial.merge_lines(previous, region, {}))
                else
                  run << region
                end
              end
            end
          end

          sections = sections + run

          # We now have sections. Add information to them.
          sections = add_content_stats sections

          # Score sections into categories based on their textual attributes.
          # NOTE(review): each pair looks like [ideal value, weight] - confirm
          # against Spatial.score.
          reference_ideals = {
            :name_ratio => [0.2, 5],
            :letter_ratio => [0.25, 2],
            :year_ratio => [0.05, 7]
          }
          body_ideals = {
            :name_ratio => [0.03, 1],
            :letter_ratio => [0.1, 1],
            :year_ratio => [0.0, 1]
          }
          ideals = { :reference => reference_ideals, :body => body_ideals }

          Spatial.score(sections, ideals)

          sections
        end

      end
    end

  end
end
@@ -0,0 +1,53 @@
1
+ require_relative "../spatial"
2
+
3
module PdfExtract
  # Picks the region most likely to be the document title.
  module Titles

    # Fraction of the tallest candidate's line height by which a region may
    # fall short and still be considered a title candidate.
    Settings.default :title_slop, 0.2

    # Registers the :titles spatial on +pdf+. The after hook returns a Hash
    # with the :content, :line_height and :font of the chosen region, or []
    # when no region qualifies.
    def self.include_in pdf
      pdf.spatials :titles, :depends_on => [:regions] do |parser|
        titles = []

        parser.objects :regions do |region|
          titles << region
        end

        parser.after do
          # A title should:
          # be longer than one letter,
          titles.reject! { |r| Spatial.get_text_content(r).strip.length < 2 }

          # be in the top half of a page,
          titles.reject! { |r| r[:y] < (r[:page_height] / 2.0) }

          # Stop here when nothing is left; the steps below index into the
          # first remaining candidate and would fail on an empty list.
          next [] if titles.empty?

          # be no less tall than a factor of the tallest text,
          tallest_line = titles.map { |r| r[:line_height] }.max
          shortest_allowed = tallest_line - (tallest_line * pdf.settings[:title_slop])
          titles.reject! { |r| r[:line_height] < shortest_allowed }

          # A negative title_slop puts the threshold above the tallest line.
          next [] if titles.empty?

          # be on the earliest page with text,
          first_page = titles.map { |r| r[:page] }.min
          titles.reject! { |r| r[:page] != first_page }

          # be the highest of the above.
          title = titles.sort_by { |r| -r[:y] }.first

          {
            :content => Spatial.get_text_content(title),
            :line_height => title[:line_height],
            :font => title[:font]
          }
        end
      end
    end

  end
end
53
+