pdf-extract 0.0.7 → 0.0.8
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/pdf-extract +14 -4
- data/lib/analysis/columns.rb +11 -2
- data/lib/analysis/sections.rb +5 -1
- data/lib/analysis/titles.rb +5 -1
- data/lib/analysis/zones.rb +5 -2
- data/lib/model/chunks.rb +17 -6
- data/lib/model/regions.rb +6 -1
- data/lib/pdf.rb +9 -3
- data/lib/references/references.rb +23 -4
- metadata +3 -3
data/bin/pdf-extract
CHANGED
@@ -129,6 +129,10 @@ commands.each do |cmd|
|
|
129
129
|
end
|
130
130
|
end
|
131
131
|
|
132
|
+
def norm_name k
|
133
|
+
k.sub /.+\:\:/, ""
|
134
|
+
end
|
135
|
+
|
132
136
|
command :settings do |c|
|
133
137
|
c.syntax = "pdf-extract settings [options]"
|
134
138
|
c.description = "Print settings that pdf-extract will use to screen."
|
@@ -137,10 +141,16 @@ command :settings do |c|
|
|
137
141
|
pdf = PdfExtract::Pdf.new
|
138
142
|
apply_settings pdf
|
139
143
|
s = pdf.settings
|
140
|
-
|
141
|
-
s.
|
142
|
-
|
143
|
-
|
144
|
+
|
145
|
+
if s.modified.empty?
|
146
|
+
s.unmodified.each_pair { |k, v| say "====================\n<%= color('#{norm_name(v[:module])}', BOLD) %> declares <%= color('#{k}', BOLD) %>, default = <%= color('#{v[:default]}', BOLD) %>\n\n#{v[:description]}\n====================\n\n" }
|
147
|
+
else
|
148
|
+
say "<%= color('AT DEFAULT:', BOLD) %>\n\n"
|
149
|
+
s.unmodified.each_pair { |k, v| say "#{k}:\t#{v}" }
|
150
|
+
|
151
|
+
say "\n<%= color('OVERRIDDEN:', BOLD) %>\n\n"
|
152
|
+
s.modified.each_pair { |k, v| say "#{k}:\t#{v} (#{s.agent(k)})" }
|
153
|
+
end
|
144
154
|
end
|
145
155
|
end
|
146
156
|
|
data/lib/analysis/columns.rb
CHANGED
@@ -1,8 +1,17 @@
|
|
1
1
|
module PdfExtract
|
2
2
|
module Columns
|
3
3
|
|
4
|
-
Settings.
|
5
|
-
|
4
|
+
Settings.declare :column_sample_count, {
|
5
|
+
:default => 8,
|
6
|
+
:module => self.name,
|
7
|
+
:description => "Columns are detected by sampling :column_sample_count lines across a page and examing the number of regions incident with each line."
|
8
|
+
}
|
9
|
+
|
10
|
+
Settings.declare :max_column_count, {
|
11
|
+
:default => 3,
|
12
|
+
:module => self.name,
|
13
|
+
:description => "The maximum number of columns that can ever occur. During column detection column counts larger than :max_column_count will be disregarded."
|
14
|
+
}
|
6
15
|
|
7
16
|
def self.columns_at y, body_regions
|
8
17
|
x_mask = MultiRange.new
|
data/lib/analysis/sections.rb
CHANGED
@@ -5,7 +5,11 @@ require_relative '../kmeans'
|
|
5
5
|
module PdfExtract
|
6
6
|
module Sections
|
7
7
|
|
8
|
-
Settings.
|
8
|
+
Settings.declare :width_ratio, {
|
9
|
+
:default => 0.9,
|
10
|
+
:module => self.name,
|
11
|
+
:description => "Minimum ratio of text region width to containing column width for a text region to be considered as part of an article section."
|
12
|
+
}
|
9
13
|
|
10
14
|
def self.match? a, b
|
11
15
|
lh = a[:line_height].round(2) == b[:line_height].round(2)
|
data/lib/analysis/titles.rb
CHANGED
@@ -3,7 +3,11 @@ require_relative "../spatial"
|
|
3
3
|
module PdfExtract
|
4
4
|
module Titles
|
5
5
|
|
6
|
-
Settings.
|
6
|
+
Settings.declare :title_slop, {
|
7
|
+
:default => 0.2,
|
8
|
+
:module => self.name,
|
9
|
+
:description => "Regions of text whose font size is less than :title_slop percent of the largest font size in a PDF will be disregarded as candidate titles. Value must be 0 - 1."
|
10
|
+
}
|
7
11
|
|
8
12
|
def self.include_in pdf
|
9
13
|
pdf.spatials :titles, :depends_on => [:regions] do |parser|
|
data/lib/analysis/zones.rb
CHANGED
@@ -5,8 +5,11 @@ module PdfExtract
|
|
5
5
|
# distance from margins. Should be within a factor of the body
|
6
6
|
# area.
|
7
7
|
|
8
|
-
|
9
|
-
|
8
|
+
Settings.declare :body_ratio, {
|
9
|
+
:default => 0.9,
|
10
|
+
:module => "Bodies, Headers, Footers",
|
11
|
+
:description => "Minium permitted ratio of page height to candidate body zone height. When detecting headers, footers and body (area between header and footer) zones, candidate header and footer areas will be disregarded if they imply a body area whose height to page height ratio is less than :body_ratio."
|
12
|
+
}
|
10
13
|
|
11
14
|
def self.include_in pdf
|
12
15
|
deps = [:top_margins, :left_margins, :right_margins, :bottom_margins, :regions]
|
data/lib/model/chunks.rb
CHANGED
@@ -5,14 +5,25 @@ module PdfExtract
|
|
5
5
|
|
6
6
|
# TODO Look for obj[:writing_mode] == :vertical or :horizontal
|
7
7
|
|
8
|
-
Settings.
|
9
|
-
|
10
|
-
|
8
|
+
Settings.declare :char_slop, {
|
9
|
+
:default => 0.2,
|
10
|
+
:module => self.name,
|
11
|
+
:description => "Maximum allowed space between characters for them to be considered part of the same word. char_slop is multiplied by the width of each character to find its joining width."
|
12
|
+
}
|
13
|
+
|
14
|
+
Settings.declare :word_slop, {
|
15
|
+
:default => 4.0,
|
16
|
+
:module => self.name,
|
17
|
+
:description => "Maximum allowed space between words for them to be considered part of the same line. word_slop is multiplied by width of the last character in a word to find its joining width."
|
18
|
+
}
|
19
|
+
|
20
|
+
Settings.declare :overlap_slop, {
|
21
|
+
:default => 0.9,
|
22
|
+
:module => self.name,
|
23
|
+
:description => "A minimum permitted ratio of the overlapped height of words for them to join together into lines."
|
24
|
+
}
|
11
25
|
|
12
26
|
def self.include_in pdf
|
13
|
-
char_slop = 0.2
|
14
|
-
word_slop = 4.0
|
15
|
-
overlap_slop = 0.9
|
16
27
|
|
17
28
|
pdf.spatials :chunks, :paged => true, :depends_on => [:characters] do |parser|
|
18
29
|
rows = {}
|
data/lib/model/regions.rb
CHANGED
@@ -3,7 +3,12 @@ require_relative '../spatial'
|
|
3
3
|
module PdfExtract
|
4
4
|
module Regions
|
5
5
|
|
6
|
-
Settings.
|
6
|
+
Settings.declare :line_slop, {
|
7
|
+
:default => 1.0,
|
8
|
+
:module => self.name,
|
9
|
+
:description => "Maximum allowed line spacing between lines that are considered
|
10
|
+
to be part of the same region. :line_slop is multiplied by the average line height of a region to find a maximum line spacing between a region and a candidate line."
|
11
|
+
}
|
7
12
|
|
8
13
|
# TODO Handle :writing_mode once present in characters and text_chunks.
|
9
14
|
|
data/lib/pdf.rb
CHANGED
@@ -6,8 +6,13 @@ module PdfExtract
|
|
6
6
|
|
7
7
|
@@defaults = {}
|
8
8
|
|
9
|
-
def self.
|
10
|
-
|
9
|
+
def self.declare key, opts={}
|
10
|
+
default_hash = {
|
11
|
+
:default => "",
|
12
|
+
:description => "",
|
13
|
+
:module => ""
|
14
|
+
}.merge(opts)
|
15
|
+
@@defaults[key] = default_hash
|
11
16
|
end
|
12
17
|
|
13
18
|
def initialize
|
@@ -16,7 +21,8 @@ module PdfExtract
|
|
16
21
|
end
|
17
22
|
|
18
23
|
def [] key
|
19
|
-
@settings[key] ||
|
24
|
+
@settings[key] ||
|
25
|
+
(@@defaults[key] && @@defaults[key][:default]) ||
|
20
26
|
raise("Attempt to use undeclared setting \"#{key}\"")
|
21
27
|
end
|
22
28
|
|
@@ -5,10 +5,29 @@ require_relative "score"
|
|
5
5
|
module PdfExtract
|
6
6
|
module References
|
7
7
|
|
8
|
-
Settings.
|
9
|
-
|
10
|
-
|
11
|
-
|
8
|
+
Settings.declare :reference_flex, {
|
9
|
+
:default => 0.1,
|
10
|
+
:module => self.name,
|
11
|
+
:description => "Article sections are given a score as potential reference sections. Their score is based on article section features, such as the number of family names that appear, the ratio of uppercase letters to lowercase, and so on. Any article section that has a score that is more than 1 - :reference_flex percent of the best score will be parsed as a reference section."
|
12
|
+
}
|
13
|
+
|
14
|
+
Settings.declare :min_sequence_count, {
|
15
|
+
:default => 3,
|
16
|
+
:module => self.name,
|
17
|
+
:description => "There must be :min_sequence_count or more numbered references within a candidate reference section for them to be parsed as number-delimited references."
|
18
|
+
}
|
19
|
+
|
20
|
+
Settings.declare :max_reference_order, {
|
21
|
+
:default => 1000,
|
22
|
+
:module => self.name,
|
23
|
+
:description => "References whose number would be greater than :max_reference_order are ignored. This helps avoid confusing year literals with reference numbers."
|
24
|
+
}
|
25
|
+
|
26
|
+
Settings.declare :min_lateness, {
|
27
|
+
:default => 0.5,
|
28
|
+
:module => self.name,
|
29
|
+
:description => "Article sections that appear early in an article will not be considered as candidate reference sections. :min_lateness is a value from 0 to 1, where 0 represents the start of an article, 1 the end."
|
30
|
+
}
|
12
31
|
|
13
32
|
def self.partition_by ary, &block
|
14
33
|
matching = []
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 0
|
8
|
-
-
|
9
|
-
version: 0.0.
|
8
|
+
- 8
|
9
|
+
version: 0.0.8
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Karl Jonathan Ward
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-11-
|
17
|
+
date: 2011-11-10 00:00:00 +00:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|