pdf-extract 0.0.7 → 0.0.8

Sign up to get free protection for your applications and to get access to all the features.
@@ -129,6 +129,10 @@ commands.each do |cmd|
129
129
  end
130
130
  end
131
131
 
132
+ def norm_name k
133
+ k.sub /.+\:\:/, ""
134
+ end
135
+
132
136
  command :settings do |c|
133
137
  c.syntax = "pdf-extract settings [options]"
134
138
  c.description = "Print settings that pdf-extract will use to screen."
@@ -137,10 +141,16 @@ command :settings do |c|
137
141
  pdf = PdfExtract::Pdf.new
138
142
  apply_settings pdf
139
143
  s = pdf.settings
140
- say "<%= color('AT DEFAULT:', BOLD) %>\n\n"
141
- s.unmodified.each_pair { |k, v| say "#{k}:\t#{v}" }
142
- say "\n<%= color('OVERRIDDEN:', BOLD) %>\n\n"
143
- s.modified.each_pair { |k, v| say "#{k}:\t#{v} (#{s.agent(k)})" }
144
+
145
+ if s.modified.empty?
146
+ s.unmodified.each_pair { |k, v| say "====================\n<%= color('#{norm_name(v[:module])}', BOLD) %> declares <%= color('#{k}', BOLD) %>, default = <%= color('#{v[:default]}', BOLD) %>\n\n#{v[:description]}\n====================\n\n" }
147
+ else
148
+ say "<%= color('AT DEFAULT:', BOLD) %>\n\n"
149
+ s.unmodified.each_pair { |k, v| say "#{k}:\t#{v}" }
150
+
151
+ say "\n<%= color('OVERRIDDEN:', BOLD) %>\n\n"
152
+ s.modified.each_pair { |k, v| say "#{k}:\t#{v} (#{s.agent(k)})" }
153
+ end
144
154
  end
145
155
  end
146
156
 
@@ -1,8 +1,17 @@
1
1
  module PdfExtract
2
2
  module Columns
3
3
 
4
- Settings.default :column_sample_count, 8
5
- Settings.default :max_column_count, 3
4
+ Settings.declare :column_sample_count, {
5
+ :default => 8,
6
+ :module => self.name,
7
+ :description => "Columns are detected by sampling :column_sample_count lines across a page and examining the number of regions incident with each line."
8
+ }
9
+
10
+ Settings.declare :max_column_count, {
11
+ :default => 3,
12
+ :module => self.name,
13
+ :description => "The maximum number of columns that can ever occur. During column detection column counts larger than :max_column_count will be disregarded."
14
+ }
6
15
 
7
16
  def self.columns_at y, body_regions
8
17
  x_mask = MultiRange.new
@@ -5,7 +5,11 @@ require_relative '../kmeans'
5
5
  module PdfExtract
6
6
  module Sections
7
7
 
8
- Settings.default :width_ratio, 0.9
8
+ Settings.declare :width_ratio, {
9
+ :default => 0.9,
10
+ :module => self.name,
11
+ :description => "Minimum ratio of text region width to containing column width for a text region to be considered as part of an article section."
12
+ }
9
13
 
10
14
  def self.match? a, b
11
15
  lh = a[:line_height].round(2) == b[:line_height].round(2)
@@ -3,7 +3,11 @@ require_relative "../spatial"
3
3
  module PdfExtract
4
4
  module Titles
5
5
 
6
- Settings.default :title_slop, 0.2
6
+ Settings.declare :title_slop, {
7
+ :default => 0.2,
8
+ :module => self.name,
9
+ :description => "Regions of text whose font size is less than :title_slop percent of the largest font size in a PDF will be disregarded as candidate titles. Value must be 0 - 1."
10
+ }
7
11
 
8
12
  def self.include_in pdf
9
13
  pdf.spatials :titles, :depends_on => [:regions] do |parser|
@@ -5,8 +5,11 @@ module PdfExtract
5
5
  # distance from margins. Should be within a factor of the body
6
6
  # area.
7
7
 
8
- # Ratio of marginless page height to minimum body height.
9
- Settings.default :body_ratio, 0.9
8
+ Settings.declare :body_ratio, {
9
+ :default => 0.9,
10
+ :module => "Bodies, Headers, Footers",
11
+ :description => "Minimum permitted ratio of page height to candidate body zone height. When detecting headers, footers and body (area between header and footer) zones, candidate header and footer areas will be disregarded if they imply a body area whose height to page height ratio is less than :body_ratio."
12
+ }
10
13
 
11
14
  def self.include_in pdf
12
15
  deps = [:top_margins, :left_margins, :right_margins, :bottom_margins, :regions]
@@ -5,14 +5,25 @@ module PdfExtract
5
5
 
6
6
  # TODO Look for obj[:writing_mode] == :vertical or :horizontal
7
7
 
8
- Settings.default :char_slop, 0.2
9
- Settings.default :word_slop, 4.0
10
- Settings.default :overlap_slop, 0.9
8
+ Settings.declare :char_slop, {
9
+ :default => 0.2,
10
+ :module => self.name,
11
+ :description => "Maximum allowed space between characters for them to be considered part of the same word. char_slop is multiplied by the width of each character to find its joining width."
12
+ }
13
+
14
+ Settings.declare :word_slop, {
15
+ :default => 4.0,
16
+ :module => self.name,
17
+ :description => "Maximum allowed space between words for them to be considered part of the same line. word_slop is multiplied by width of the last character in a word to find its joining width."
18
+ }
19
+
20
+ Settings.declare :overlap_slop, {
21
+ :default => 0.9,
22
+ :module => self.name,
23
+ :description => "A minimum permitted ratio of the overlapped height of words for them to join together into lines."
24
+ }
11
25
 
12
26
  def self.include_in pdf
13
- char_slop = 0.2
14
- word_slop = 4.0
15
- overlap_slop = 0.9
16
27
 
17
28
  pdf.spatials :chunks, :paged => true, :depends_on => [:characters] do |parser|
18
29
  rows = {}
@@ -3,7 +3,12 @@ require_relative '../spatial'
3
3
  module PdfExtract
4
4
  module Regions
5
5
 
6
- Settings.default :line_slop, 1.0
6
+ Settings.declare :line_slop, {
7
+ :default => 1.0,
8
+ :module => self.name,
9
+ :description => "Maximum allowed line spacing between lines that are considered
10
+ to be part of the same region. :line_slop is multiplied by the average line height of a region to find a maximum line spacing between a region and a candidate line."
11
+ }
7
12
 
8
13
  # TODO Handle :writing_mode once present in characters and text_chunks.
9
14
 
data/lib/pdf.rb CHANGED
@@ -6,8 +6,13 @@ module PdfExtract
6
6
 
7
7
  @@defaults = {}
8
8
 
9
- def self.default key, default_value
10
- @@defaults[key] = default_value
9
+ def self.declare key, opts={}
10
+ default_hash = {
11
+ :default => "",
12
+ :description => "",
13
+ :module => ""
14
+ }.merge(opts)
15
+ @@defaults[key] = default_hash
11
16
  end
12
17
 
13
18
  def initialize
@@ -16,7 +21,8 @@ module PdfExtract
16
21
  end
17
22
 
18
23
  def [] key
19
- @settings[key] || @@defaults[key] ||
24
+ @settings[key] ||
25
+ (@@defaults[key] && @@defaults[key][:default]) ||
20
26
  raise("Attempt to use undeclared setting \"#{key}\"")
21
27
  end
22
28
 
@@ -5,10 +5,29 @@ require_relative "score"
5
5
  module PdfExtract
6
6
  module References
7
7
 
8
- Settings.default :reference_flex, 0.1
9
- Settings.default :min_sequence_count, 3
10
- Settings.default :max_reference_order, 1000
11
- Settings.default :min_lateness , 0.5
8
+ Settings.declare :reference_flex, {
9
+ :default => 0.1,
10
+ :module => self.name,
11
+ :description => "Article sections are given a score as potential reference sections. Their score is based on article section features, such as the number of family names that appear, the ratio of uppercase letters to lowercase, and so on. Any article section that has a score that is more than 1 - :reference_flex percent of the best score will be parsed as a reference section."
12
+ }
13
+
14
+ Settings.declare :min_sequence_count, {
15
+ :default => 3,
16
+ :module => self.name,
17
+ :description => "There must be :min_sequence_count or more numbered references within a candidate reference section for them to be parsed as number-delimited references."
18
+ }
19
+
20
+ Settings.declare :max_reference_order, {
21
+ :default => 1000,
22
+ :module => self.name,
23
+ :description => "References whose number would be greater than :max_reference_order are ignored. This helps avoid confusing year literals with reference numbers."
24
+ }
25
+
26
+ Settings.declare :min_lateness, {
27
+ :default => 0.5,
28
+ :module => self.name,
29
+ :description => "Article sections that appear early in an article will not be considered as candidate reference sections. :min_lateness is a value from 0 to 1, where 0 represents the start of an article, 1 the end."
30
+ }
12
31
 
13
32
  def self.partition_by ary, &block
14
33
  matching = []
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 0
8
- - 7
9
- version: 0.0.7
8
+ - 8
9
+ version: 0.0.8
10
10
  platform: ruby
11
11
  authors:
12
12
  - Karl Jonathan Ward
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-11-09 00:00:00 +00:00
17
+ date: 2011-11-10 00:00:00 +00:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency