RubyGems - pdf-extract - Versions diffs - 0.0.7 → 0.0.8 - Mend

pdf-extract 0.0.7 → 0.0.8

Files changed (10)

data/bin/pdf-extract +14 -4
data/lib/analysis/columns.rb +11 -2
data/lib/analysis/sections.rb +5 -1
data/lib/analysis/titles.rb +5 -1
data/lib/analysis/zones.rb +5 -2
data/lib/model/chunks.rb +17 -6
data/lib/model/regions.rb +6 -1
data/lib/pdf.rb +9 -3
data/lib/references/references.rb +23 -4
metadata +3 -3

data/bin/pdf-extract CHANGED

@@ -129,6 +129,10 @@ commands.each do |cmd|
   end
 end
+def norm_name k
+  k.sub /.+\:\:/, ""
+end
 command :settings do |c|
   c.syntax = "pdf-extract settings [options]"
   c.description = "Print settings that pdf-extract will use to screen."
@@ -137,10 +141,16 @@ command :settings do |c|
     pdf = PdfExtract::Pdf.new
     apply_settings pdf
     s = pdf.settings
-    say "<%= color('AT DEFAULT:', BOLD) %>\n\n"
-    s.unmodified.each_pair { |k, v| say "#{k}:\t#{v}" }
-    say "\n<%= color('OVERRIDDEN:', BOLD) %>\n\n"
-    s.modified.each_pair { |k, v| say "#{k}:\t#{v}   (#{s.agent(k)})" }
+    if s.modified.empty?
+      s.unmodified.each_pair { |k, v| say "====================\n<%= color('#{norm_name(v[:module])}', BOLD) %> declares <%= color('#{k}', BOLD) %>, default = <%= color('#{v[:default]}', BOLD) %>\n\n#{v[:description]}\n====================\n\n" }
+    else
+      say "<%= color('AT DEFAULT:', BOLD) %>\n\n"
+      s.unmodified.each_pair { |k, v| say "#{k}:\t#{v}" }
+      say "\n<%= color('OVERRIDDEN:', BOLD) %>\n\n"
+      s.modified.each_pair { |k, v| say "#{k}:\t#{v}   (#{s.agent(k)})" }
+    end
   end
 end

data/lib/analysis/columns.rb CHANGED

@@ -1,8 +1,17 @@
 module PdfExtract
   module Columns
-    Settings.default :column_sample_count, 8
-    Settings.default :max_column_count, 3
+    Settings.declare :column_sample_count, {
+      :default => 8,
+      :module => self.name,
+      :description => "Columns are detected by sampling :column_sample_count lines across a page and examining the number of regions incident with each line."
+    }
+    Settings.declare :max_column_count, {
+      :default => 3,
+      :module => self.name,
+      :description => "The maximum number of columns that can ever occur. During column detection column counts larger than :max_column_count will be disregarded."
+    }
     def self.columns_at y, body_regions
       x_mask = MultiRange.new

data/lib/analysis/sections.rb CHANGED

@@ -5,7 +5,11 @@ require_relative '../kmeans'
 module PdfExtract
   module Sections
-    Settings.default :width_ratio, 0.9
+    Settings.declare :width_ratio, {
+      :default => 0.9,
+      :module => self.name,
+      :description => "Minimum ratio of text region width to containing column width for a text region to be considered as part of an article section."
+    }
     def self.match? a, b
       lh = a[:line_height].round(2) == b[:line_height].round(2)

data/lib/analysis/titles.rb CHANGED

@@ -3,7 +3,11 @@ require_relative "../spatial"
 module PdfExtract
   module Titles
-    Settings.default :title_slop, 0.2
+    Settings.declare :title_slop, {
+      :default => 0.2,
+      :module => self.name,
+      :description => "Regions of text whose font size is less than :title_slop percent of the largest font size in a PDF will be disregarded as candidate titles. Value must be 0 - 1."
+    }
     def self.include_in pdf
       pdf.spatials :titles, :depends_on => [:regions] do |parser|

data/lib/analysis/zones.rb CHANGED

@@ -5,8 +5,11 @@ module PdfExtract
     # distance from margins. Should be within a factor of the body
     # area.
-    # Ratio of marginless page height to minimum body height.
-    Settings.default :body_ratio, 0.9
+    Settings.declare :body_ratio, {
+      :default => 0.9,
+      :module => "Bodies, Headers, Footers",
+      :description => "Minimum permitted ratio of page height to candidate body zone height. When detecting headers, footers and body (area between header and footer) zones, candidate header and footer areas will be disregarded if they imply a body area whose height to page height ratio is less than :body_ratio."
+    }
     def self.include_in pdf
       deps = [:top_margins, :left_margins, :right_margins, :bottom_margins, :regions]

data/lib/model/chunks.rb CHANGED

@@ -5,14 +5,25 @@ module PdfExtract
     # TODO Look for obj[:writing_mode] == :vertical or :horizontal
-    Settings.default :char_slop, 0.2
-    Settings.default :word_slop, 4.0
-    Settings.default :overlap_slop, 0.9
+    Settings.declare :char_slop, {
+      :default => 0.2,
+      :module => self.name,
+      :description => "Maximum allowed space between characters for them to be considered part of the same word. char_slop is multiplied by the width of each character to find its joining width."
+    }
+    Settings.declare :word_slop, {
+      :default => 4.0,
+      :module => self.name,
+      :description => "Maximum allowed space between words for them to be considered part of the same line. word_slop is multiplied by width of the last character in a word to find its joining width."
+    }
+    Settings.declare :overlap_slop, {
+      :default => 0.9,
+      :module => self.name,
+      :description => "A minimum permitted ratio of the overlapped height of words for them to join together into lines."
+    }
     def self.include_in pdf
-      char_slop = 0.2
-      word_slop = 4.0
-      overlap_slop = 0.9
       pdf.spatials :chunks, :paged => true, :depends_on => [:characters] do |parser|
         rows = {}

data/lib/model/regions.rb CHANGED

@@ -3,7 +3,12 @@ require_relative '../spatial'
 module PdfExtract
   module Regions
-    Settings.default :line_slop, 1.0
+    Settings.declare :line_slop, {
+      :default => 1.0,
+      :module => self.name,
+      :description => "Maximum allowed line spacing between lines that are considered
+to be part of the same region. :line_slop is multiplied by the average line height of a region to find a maximum line spacing between a region and a candidate line."
+    }
     # TODO Handle :writing_mode once present in characters and text_chunks.

data/lib/pdf.rb CHANGED

@@ -6,8 +6,13 @@ module PdfExtract
     @@defaults = {}
-    def self.default key, default_value
-      @@defaults[key] = default_value
+    def self.declare key, opts={}
+      default_hash = {
+        :default => "",
+        :description => "",
+        :module => ""
+      }.merge(opts)
+      @@defaults[key] = default_hash
     end
     def initialize
@@ -16,7 +21,8 @@ module PdfExtract
     end
     def [] key
-      @settings[key] || @@defaults[key] ||
+      @settings[key] ||
+        (@@defaults[key] && @@defaults[key][:default]) ||
         raise("Attempt to use undeclared setting \"#{key}\"")
     end

data/lib/references/references.rb CHANGED

@@ -5,10 +5,29 @@ require_relative "score"
 module PdfExtract
   module References
-    Settings.default :reference_flex, 0.1
-    Settings.default :min_sequence_count, 3
-    Settings.default :max_reference_order, 1000
-    Settings.default :min_lateness , 0.5
+    Settings.declare :reference_flex, {
+      :default => 0.1,
+      :module => self.name,
+      :description => "Article sections are given a score as potential reference sections. Their score is based on article section features, such as the number of family names that appear, the ratio of uppercase letters to lowercase, and so on. Any article section that has a score that is more than 1 - :reference_flex percent of the best score will be parsed as a reference section."
+    }
+    Settings.declare :min_sequence_count, {
+      :default => 3,
+      :module => self.name,
+      :description => "There must be :min_sequence_count or more numbered references within a candidate reference section for them to be parsed as number-delimited references."
+    }
+    Settings.declare :max_reference_order, {
+      :default => 1000,
+      :module => self.name,
+      :description => "References whose number would be greater than :max_reference_order are ignored. This helps avoid confusing year literals with reference numbers."
+    }
+    Settings.declare :min_lateness, {
+      :default => 0.5,
+      :module => self.name,
+      :description => "Article sections that appear early in an article will not be considered as candidate reference sections. :min_lateness is a value from 0 to 1, where 0 represents the start of an article, 1 the end."
+    }
     def self.partition_by ary, &block
       matching = []

metadata CHANGED

@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
   segments:
   - 0
   - 0
-  - 7
-  version: 0.0.7
+  - 8
+  version: 0.0.8
 platform: ruby
 authors:
 - Karl Jonathan Ward
@@ -14,7 +14,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-11-09 00:00:00 +00:00
+date: 2011-11-10 00:00:00 +00:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency