RubyGems - pdf-extract - Versions diffs - 0.0.1 - Mend

pdf-extract 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

data/bin/assign.rb +72 -0
data/bin/config.json +4 -0
data/bin/fac_v19n11_s5.mask.pdf +0 -0
data/bin/margins.mask.pdf +0 -0
data/bin/one-column.mask.pdf +24110 -39
data/bin/pdf-extract +146 -0
data/bin/s002040050107_Arch_Toxicol_1994_68_8.mask.pdf +0 -0
data/bin/some3.mask.pdf +0 -0
data/bin/some5.mask.pdf +0 -0
data/bin/some6.mask.pdf +0 -0
data/bin/train.rb +48 -0
data/bin/two-column.mask.pdf +0 -0
data/data/familynames.db +0 -0
data/data/stopwords.txt +1 -0
data/lib/analysis/columns.rb +75 -0
data/lib/analysis/margins.rb +84 -0
data/lib/analysis/sections.rb +156 -0
data/lib/analysis/titles.rb +53 -0
data/lib/analysis/zones.rb +128 -0
data/lib/font_metrics.rb +240 -0
data/lib/kmeans.rb +114 -0
data/lib/language.rb +58 -0
data/lib/model/characters.rb +320 -0
data/lib/model/chunks.rb +103 -0
data/lib/model/regions.rb +112 -0
data/lib/multi_range.rb +69 -0
data/lib/names.rb +85 -0
data/lib/pdf-extract.rb +77 -0
data/lib/pdf.rb +255 -0
data/lib/references/references.rb +184 -0
data/lib/references/resolve.rb +113 -0
data/lib/references/resolved_references.rb +37 -0
data/lib/spatial.rb +188 -0
data/lib/view/abstract_view.rb +32 -0
data/lib/view/pdf_view.rb +43 -0
data/lib/view/png_view.rb +30 -0
data/lib/view/xml_view.rb +113 -0
metadata +208 -0

data/bin/pdf-extract ADDED Viewed

@@ -0,0 +1,146 @@
+#!/usr/bin/env ruby
+require 'commander/import'
+require 'json'
+require_relative '../lib/pdf-extract'
+require_relative '../lib/references/resolve'
+program :name, 'pdf-extract'
+program :version, '0.0.1'
+program :description, 'PDF content extraction toolkit'
+semantic = ['resolved_references', 'references', 'titles', 'sections']
+margins = ['top_margins', 'bottom_margins', 'left_margins', 'right_margins']
+zones = ['headers', 'footers', 'bodies']
+objects = ['characters', 'chunks', 'regions', 'columns'] + semantic + margins + zones
+resolvers = {
+  "sigg" => PdfExtract::Resolve::Sigg,
+  "freecite" => PdfExtract::Resolve::FreeCite,
+  "stq" => PdfExtract::Resolve::SimpleTextQuery
+}
+outputs = {
+  :xml => proc { :stdout },
+  :pdf => proc { |f| File::basename(f.sub /\.[a-zA-Z0-9]+\Z/, "") + ".mask.pdf" },
+  :png => proc { |f| File::basename(f.sub /\.[a-zA-Z0-9]+\Z/, "") + ".mask.png" }
+}
+commands = [
+  {
+    :name => "extract",
+    :view => :xml,
+    :description => "Extract objects as XML."
+  },
+  {
+    :name => "mark",
+    :view => :pdf,
+    :description => "Highlight bounding boxes of objects in a PDF."
+  },
+  {
+    :name => "annotate",
+    :view => :not_implemented,
+    :description => "Annotate a PDF with attributes of extracted objects."
+  }
+]
+$chosen_objects = []
+$render_options = {}
+$overrides = {}
+objects.each do |o|
+  global_option "--#{o}" do |_|
+    $chosen_objects << o
+  end
+end
+global_option "--semantic" do |_| $chosen_objects += semantic end
+global_option "--margins" do |_| $chosen_objects += margins end
+global_option "--zones" do |_| $chosen_objects += zones end
+global_option "--resolvers RESOLVERS" do |chosen_resolvers|
+  chosen_resolvers = chosen_resolvers.split ","
+  chosen_resolvers.each do |name|
+    fail "No such resolver #{resolver}" unless resolvers.key? name
+  end
+  chosen_resolvers.map! { |name| resolvers[name] }
+  PdfExtract::Resolve.resolvers = chosen_resolvers
+end
+global_option "--output FILE" do |filename|
+  $output = filename
+end
+global_option "--no-lines" do |_|
+  $render_options[:lines] = false
+end
+global_option "--precision DIGITS" do |digits|
+  $render_options[:round] = digits.to_i
+end
+global_option "--outline" do |_|
+  $render_options[:outline] = true
+end
+global_option "--set SETTING:VALUE" do |s|
+  (name, value) = s.split ":"
+  $overrides[name] = value
+end
+global_option "--config CONFIG_FILE" do |filename|
+  $config = filename
+end
+def self.apply_settings pdf
+  if not $config.nil?
+    conf = JSON.parse File.open($config, "r").read
+    conf.each_pair do |setting, value|
+      pdf.set setting.to_sym, value, $config
+    end
+  end
+  $overrides.each_pair { |k,v| pdf.set k.to_sym, v, "command line" }
+end
+commands.each do |cmd|
+  command cmd[:name].to_sym do |c|
+    c.syntax = "pdf-extract #{cmd[:name]} [options] filename"
+    c.description = cmd[:description]
+    c.action do |args, options|
+      args.each do |filename|
+        $output = outputs[cmd[:view]].call(filename) if $output.nil?
+        opts = {:as => cmd[:view]}.merge $render_options
+        out = PdfExtract.view filename, opts do |pdf|
+          apply_settings pdf
+          $chosen_objects.each { |name| pdf.send name.to_sym }
+        end
+        if $output == :stdout
+          say out
+        else
+          PdfExtract.view_class(cmd[:view]).write(out, $output)
+        end
+      end
+    end
+  end
+end
+command :settings do |c|
+  c.syntax = "pdf-extract settings [options]"
+  c.description = "Print settings that pdf-extract will use to screen."
+  c.action do |args, options|
+    pdf = PdfExtract::Pdf.new
+    apply_settings pdf
+    s = pdf.settings
+    say "<%= color('AT DEFAULT:', BOLD) %>\n\n"
+    s.unmodified.each_pair { |k, v| say "#{k}:\t#{v}" }
+    say "\n<%= color('OVERRIDDEN:', BOLD) %>\n\n"
+    s.modified.each_pair { |k, v| say "#{k}:\t#{v}   (#{s.agent(k)})" }
+  end
+end

data/bin/s002040050107_Arch_Toxicol_1994_68_8.mask.pdf ADDED Viewed

File without changes

data/bin/some3.mask.pdf ADDED Viewed

Binary file

data/bin/some5.mask.pdf ADDED Viewed

Binary file

data/bin/some6.mask.pdf ADDED Viewed

Binary file

data/bin/train.rb ADDED Viewed

@@ -0,0 +1,48 @@
+# Train ideal attributes based on example input.
+require_relative "../lib/language"
+variables = {
+  :name_ratio => method(PdfExtract::Language::name_ratio),
+  :letter_ratio => method(PdfExtract::Language::letter_ratio),
+  :year_ratio => method(PdfExtract::Language::year_ratio)
+}
+results = {}
+sums = {}
+variables.each_pair do |k, _|
+  sums[k] = 0
+  results[k] = []
+end
+count = 0
+File.open(ARGV[0]).read.lines.each do |line|
+  variables.each_pair do |var, fn|
+    val = fn.call(line)
+    results[var] << val
+    sums[var] = val
+  end
+  count = count.next
+end
+avgs = {}
+sums.each_pair { |k, _| avgs[k] = sums[k] / count }
+deviations = {}
+results.each_pair do |name, vals|
+  deviations[name] = results[name].map { |val| (args[name - val]) ** 2 }
+end
+std_deviations = {}
+deviations.each_pair do |name, vals|
+  sum = 0
+  vals.each { |val| sum += val }
+  std_deviations[name] = (sum / (count - 1).to_f).sqrt
+end
+puts "Averages"
+puts avgs
+puts "Standard deviations"
+puts std_deviations

data/bin/two-column.mask.pdf ADDED Viewed

Binary file

data/data/familynames.db ADDED Viewed

Binary file

data/data/stopwords.txt ADDED Viewed

@@ -0,0 +1 @@

+ a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your,association,company,org,organisation,president,vice,nobel,prize,medicine,biology,physics,chemistry,laboratories,labs

data/lib/analysis/columns.rb ADDED Viewed

@@ -0,0 +1,75 @@
+module PdfExtract
+  module Columns
+    Settings.default :column_sample_count, 8
+    Settings.default :max_column_count, 3
+    def self.columns_at y, body_regions
+      x_mask = MultiRange.new
+      body_regions.each do |region|
+        if region[:y] <= y && (region[:y] + region[:height]) >= y
+          x_mask.append(region[:x] .. (region[:x] + region[:width]))
+        end
+      end
+      x_mask
+    end
+    def self.include_in pdf
+      deps = [:regions, :bodies]
+      pdf.spatials :columns, :paged => true, :depends_on => deps do |parser|
+        body = nil
+        body_regions = []
+        parser.before do
+          body_regions = []
+        end
+        parser.objects :bodies do |b|
+          body = b
+        end
+        parser.objects :regions do |region|
+          if Spatial.contains? body, region
+            body_regions << region
+          end
+        end
+        parser.after do
+          column_sample_count = pdf.settings[:column_sample_count]
+          step = 1.0 / (column_sample_count + 1)
+          column_ranges = []
+          (1 .. column_sample_count).each do |i|
+            y = body[:y] + (body[:height] * i * step)
+            column_ranges << columns_at(y, body_regions)
+          end
+          # Discard those with more than x columns. They've probably hit a table.
+          column_ranges.reject! { |r| r.count > pdf.settings[:max_column_count] }
+          if column_ranges.count.zero?
+            []
+          else
+            # Find the highest column count.
+            most = column_ranges.max_by { |r| r.count }.count
+            column_ranges.reject! { |r| r.count != most }
+            # Take the columns that are widest.
+            widest = column_ranges.map { |r| r.avg }.max
+            column_ranges.reject! { |r| r.avg < widest }
+            column_ranges.first.ranges.map do |range|
+              body.merge({:x => range.min, :width => range.max - range.min })
+            end
+          end
+        end
+      end
+    end
+  end
+end

data/lib/analysis/margins.rb ADDED Viewed

@@ -0,0 +1,84 @@
+require_relative '../multi_range'
+module PdfExtract
+  module Margins
+    def self.axis_spatials pdf, name, axis
+      pdf.spatials name, :paged => true, :depends_on => [:regions] do |parser|
+        axis_mask = MultiRange.new
+        page = -1
+        page_width = 0
+        page_height = 0
+        dimension = :width if axis == :x
+        dimension = :height if axis == :y
+        parser.before do
+          axis_mask = MultiRange.new
+          page = -1
+        end
+        parser.objects :regions do |region|
+          if page == -1
+            page = region[:page]
+            page_width = region[:page_width]
+            page_height = region[:page_height]
+          end
+          axis_mask.append region[axis]..(region[axis]+region[dimension])
+        end
+        parser.after do
+          if axis_mask.count.zero?
+            nil
+          else
+            yield axis_mask, {
+              :page => page,
+              :page_width => page_width,
+              :page_height => page_height
+            }
+          end
+        end
+      end
+    end
+    def self.include_in pdf
+      axis_spatials pdf, :top_margins, :y do |y_mask, obj|
+        obj.merge({
+          :x => 0,
+          :y => y_mask.max,
+          :width => obj[:page_width],
+          :height => obj[:page_height] - y_mask.max
+        })
+      end
+      axis_spatials pdf, :bottom_margins, :y do |y_mask, obj|
+        obj.merge({
+          :x => 0,
+          :y => 0,
+          :width => obj[:page_width],
+          :height => y_mask.min
+        })
+      end
+      axis_spatials pdf, :left_margins, :x do |x_mask, obj|
+        obj.merge({
+          :x => 0,
+          :y => 0,
+          :width => x_mask.min,
+          :height => obj[:page_height]
+        })
+      end
+      axis_spatials pdf, :right_margins, :x do |x_mask, obj|
+        obj.merge({
+          :x => x_mask.max,
+          :y => 0,
+          :width => obj[:page_width] - x_mask.max,
+          :height => obj[:page_height]
+        })
+      end
+    end
+  end
+end

data/lib/analysis/sections.rb ADDED Viewed

@@ -0,0 +1,156 @@
+require_relative '../language'
+require_relative '../spatial'
+require_relative '../kmeans'
+module PdfExtract
+  module Sections
+    Settings.default :width_ratio, 0.9
+    def self.match? a, b
+      lh = a[:line_height].round(2) == b[:line_height].round(2)
+      f = a[:font] == b[:font]
+      lh && f
+    end
+    def self.candidate? pdf, region, column
+      # Regions that make up sections or headers must be
+      # both less width than their column width and,
+      # unless they are a single line, must be within the
+      # width_ratio.
+      width_ratio = pdf.settings[:width_ratio]
+      within_column = region[:width] <= column[:width]
+      within_column && (region[:width].to_f / column[:width]) >= width_ratio
+    end
+    def self.reference_cluster clusters
+      # Find the cluster with name_ratio closest to 0.1
+      # Those are our reference sections.
+      ideal = 0.1
+      ref_cluster = nil
+      smallest_diff = 1
+      clusters.each do |cluster|
+        diff = (cluster[:centre][:name_ratio] - ideal).abs
+        if diff < smallest_diff
+          ref_cluster = cluster
+          smallest_diff = diff
+        end
+      end
+      ref_cluster
+    end
+    def self.clusters_to_spatials clusters
+      clusters.map do |cluster|
+        cluster[:items].each do |item|
+          centre = cluster[:centre].values.map { |v| v.round(3) }.join ", "
+          item[:centre] = centre
+        end
+        cluster[:items]
+      end.flatten
+    end
+    def self.add_content_stats sections
+      sections.map do |section|
+        content = Spatial.get_text_content section
+        Spatial.drop_spatial(section).merge({
+          :letter_ratio => Language.letter_ratio(content),
+          :year_ratio => Language.year_ratio(content),                                            :cap_ratio => Language.cap_ratio(content),
+          :name_ratio => Language.name_ratio(content),
+          :word_count => Language.word_count(content)
+        })
+      end
+    end
+    def self.include_in pdf
+      pdf.spatials :sections, :depends_on => [:regions, :columns] do |parser|
+        columns = []
+        parser.objects :columns do |column|
+          columns << {:column => column, :regions => []}
+        end
+        parser.objects :regions do |region|
+          containers = columns.reject do |c|
+            column = c[:column]
+            not (column[:page] == region[:page] && Spatial.contains?(column, region))
+          end
+          containers.first[:regions] << region unless containers.count.zero?
+        end
+        parser.after do
+          # Sort regions in each column from highest to lowest.
+          columns.each do |c|
+            c[:regions].sort_by! { |r| -r[:y] }
+          end
+          # Group columns into pages.
+          pages = {}
+          columns.each do |c|
+            pages[c[:column][:page]] ||= []
+            pages[c[:column][:page]] << c
+          end
+          # Sort bodies on each page from x left to right.
+          pages.each_pair do |page, columns|
+            columns.sort_by! { |c| c[:column][:x] }
+          end
+          sections = []
+          found = []
+          pages.each_pair do |page, columns|
+            columns.each do |c|
+              column = c[:column]
+              c[:regions].each do |region|
+                if candidate? pdf, region, column
+                  if !found.last.nil? && match?(found.last, region)
+                    content = Spatial.merge_lines(found.last, region, {})
+                    found.last.merge!(content)
+                  else
+                    found << region
+                  end
+                else
+                  sections = sections + found
+                  found = []
+                end
+              end
+            end
+          end
+          sections = sections + found
+          # We now have sections. Add information to them.
+          # add_content_types sections
+          sections = add_content_stats sections
+          # Score sections into categories based on their textual attributes.
+          ideals = {
+            :reference => {
+              :name_ratio => [0.2, 5],
+              :letter_ratio => [0.25, 2],
+              :year_ratio => [0.05, 7]
+            },
+            :body => {
+              :name_ratio => [0.03, 1],
+              :letter_ratio => [0.1, 1],
+              :year_ratio => [0.0, 1]
+            }
+          }
+          Spatial.score(sections, ideals)
+          sections
+        end
+      end
+    end
+  end
+end

data/lib/analysis/titles.rb ADDED Viewed

@@ -0,0 +1,53 @@
+require_relative "../spatial"
+module PdfExtract
+  module Titles
+    Settings.default :title_slop, 0.2
+    def self.include_in pdf
+      pdf.spatials :titles, :depends_on => [:regions] do |parser|
+        titles = []
+        parser.objects :regions do |region|
+          titles << region
+        end
+        parser.after do
+          # A title should:
+          #   be longer than one letter,
+          titles.reject! { |r| Spatial.get_text_content(r).strip.length < 2}
+          #   be in the top half of a page,
+          titles.reject! { |r| r[:y] < (r[:page_height] / 2.0) }
+          #   be no less tall than a factor of the tallest text,
+          titles.sort_by! { |r| -r[:line_height] }
+          tallest_line = titles.first[:line_height]
+          title_slop = tallest_line - (tallest_line * pdf.settings[:title_slop])
+          titles.reject! { |r| r[:line_height] < title_slop }
+          #   be on the earliest page with text,
+          titles.sort_by! { |r| r[:page] }
+          first_page = titles.first[:page]
+          titles.reject! { |r| r[:page] != first_page }
+          #   be the highest of the above.
+          titles.sort_by! { |r| -r[:y] }
+          if titles.count.zero?
+            []
+          else
+            {
+              :content => Spatial.get_text_content(titles.first),
+              :line_height => titles.first[:line_height],
+              :font => titles.first[:font]
+            }
+          end
+        end
+      end
+    end
+  end
+end