pdf-struct 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. data/lib/pdf/extractor.rb +127 -0
  2. metadata +46 -0
@@ -0,0 +1,127 @@
1
+ require 'rexml/document'
2
+ require 'rexml/streamlistener'
3
+
4
module PDF
  # Entry point for extracting positioned text from a PDF by way of the
  # external +pdftohtml+ command-line tool.
  module Extractor
    # Raised when the +pdftohtml+ executable cannot be started.
    class ConversionError < RuntimeError; end

    # Raised when +pdftohtml+ reports that the PDF file is damaged.
    class MalformedPDFError < RuntimeError; end

    # Runs +pdftohtml+ on the file at +path+ and wraps its XML output in a
    # PDF::Extractor::Document.
    #
    # The command is spawned from an argument array, so +path+ is handed to
    # the tool verbatim and is never interpreted by a shell (spaces, quotes
    # and metacharacters in file names are safe).
    #
    # path - the location of the PDF file; converted with #to_s.
    #
    # Returns a PDF::Extractor::Document built from the tool's output.
    # Raises ConversionError when the pdftohtml executable is not found.
    # Raises MalformedPDFError when the output contains "PDF file is damaged".
    # Raises RuntimeError when the output contains "Couldn't open file".
    def self.open(path)
      command = ['pdftohtml', '-enc', 'UTF-8', '-xml', '-stdout', path.to_s]

      input = begin
        # The trailing options hash merges stderr into stdout so that the
        # tool's diagnostics can be matched below.
        IO.popen(command + [{ :err => [:child, :out] }], &:read)
      rescue Errno::ENOENT
        raise ConversionError, 'pdftohtml command not found'
      end

      case input
      when /PDF file is damaged/
        raise MalformedPDFError, "the PDF with filename '#{path}' is malformed"
      when /Couldn't open file/
        raise RuntimeError, "Couldn't open file: '#{path}'"
      else
        PDF::Extractor::Document.new(input)
      end
    end
  end
end
24
+
25
module PDF
  module Extractor
    # A single text element of a page: its geometry, the font it refers to
    # and its textual content. Only +content+ can be changed after creation.
    class Element
      attr_accessor :content
      attr_reader :top, :left, :width, :height, :font

      # params - a Hash that may carry :top, :left, :width, :height, :font
      #          and :content; keys that are absent leave the attribute nil.
      def initialize(params = {})
        [:top, :left, :width, :height, :font, :content].each do |key|
          instance_variable_set("@#{key}", params[key])
        end
      end
    end
  end
end
38
+
39
module PDF
  module Extractor
    # Describes a font by id, name and size, plus a mutable style flag that
    # starts out as :normal.
    class Font
      attr_accessor :style
      attr_reader :id, :name, :size

      # params - a Hash that may carry :id, :name and :size. The size is
      #          coerced with #to_f, so a missing size becomes 0.0.
      def initialize(params = {})
        @id = params[:id]
        @size = params[:size].to_f
        @name = params[:name]
        @style = :normal
      end

      # True while the style is :normal.
      def normal?
        @style == :normal
      end

      # True when the style has been set to :bold.
      def bold?
        @style == :bold
      end

      # True when the style has been set to :italic.
      def italic?
        @style == :italic
      end
    end
  end
end
54
+
55
module PDF
  module Extractor
    # A page with its dimensions and the list of elements collected for it.
    class Page
      attr_reader :elements, :width, :height

      # params - a Hash that may carry :width and :height.
      #
      # Every page starts with its own empty elements array.
      def initialize(params = {})
        @width, @height = params[:width], params[:height]
        @elements = []
      end
    end
  end
end
64
+
65
module PDF
  module Extractor
    # REXML stream listener that turns <page>, <fontspec> and <text> tags
    # into Page, Font and Element objects.
    #
    # Text is captured from the opening <text> tag until its closing tag, so
    # fragments separated by nested tags (<b>, <i>, <a>, ...) are joined into
    # one content string instead of the last fragment replacing the others.
    class Reader
      include REXML::StreamListener

      attr_reader :pages, :fonts

      def initialize
        @pages, @fonts = [], []
        @current_element = nil
        @text_buffer = ''
      end

      # Called by the parser for every opening tag.
      #
      # name       - the tag name.
      # attributes - the tag's attributes, read here with String keys.
      def tag_start(name, attributes)
        case name
        when 'page'
          leave_text
          @pages << PDF::Extractor::Page.new(
            :width  => attributes['width'].to_f,
            :height => attributes['height'].to_f
          )
        when 'fontspec'
          leave_text
          @fonts << PDF::Extractor::Font.new(
            :id   => attributes['id'],
            # NOTE(review): the +2 offset is inherited from the original
            # code, which itself questioned it ("is this right?").
            :size => attributes['size'].to_f + 2,
            :name => attributes['family']
          )
        when 'text'
          enter_text(attributes)
        when 'b'
          apply_style(:bold)
        when 'i'
          apply_style(:italic)
        end
      end

      # Called by the parser for every closing tag; stops text capture once
      # the current <text> element is closed.
      def tag_end(name)
        leave_text if name == 'text'
      end

      # Called by the parser for character data. While inside a <text> tag
      # the fragment is appended to what was already seen; the element's
      # content stays nil until the accumulated text has a non-blank character.
      def text(str)
        return unless @current_element

        @text_buffer += str
        @current_element.content = @text_buffer if @text_buffer =~ /\S/
      end

      private

      # Creates the element for a <text> tag on the most recent page. A
      # <text> tag seen before any <page> has no page to belong to and is
      # skipped rather than failing on nil.
      def enter_text(attributes)
        @text_buffer = ''
        page = @pages.last
        if page.nil?
          @current_element = nil
          return
        end

        @current_element = PDF::Extractor::Element.new(
          :top    => attributes['top'].to_f,
          :left   => attributes['left'].to_f,
          :width  => attributes['width'].to_f,
          :height => attributes['height'].to_f,
          :font   => @fonts.find { |font| font.id == attributes['font'] }
        )
        page.elements << @current_element
      end

      # Ends text capture for the current element, if any.
      def leave_text
        @current_element = nil
      end

      # Marks the current element's font with +style+. Does nothing outside
      # a <text> tag or when the element's font id matched no <fontspec>.
      #
      # NOTE(review): the Font is the same object stored in #fonts, so the
      # style change is visible through every element that refers to it.
      def apply_style(style)
        font = @current_element && @current_element.font
        font.style = style if font
      end
    end
  end
end
110
+
111
module PDF
  module Extractor
    # The parsed result of one conversion: the pages and fonts collected by
    # PDF::Extractor::Reader while streaming over the XML source.
    class Document
      # pages - the page objects collected by the reader, in document order.
      # fonts - the font objects collected by the reader (previously stored
      #         but not readable from outside).
      attr_reader :pages, :fonts

      # source - the XML to parse; handed unchanged to
      #          REXML::Parsers::StreamParser.
      def initialize(source)
        populate source
      end

      # Returns the elements of all pages as one flat Array, page by page.
      def elements
        @pages.flat_map { |page| page.elements }
      end

      private

      # Streams +source+ through a fresh Reader and keeps what it collected.
      def populate(source)
        listener = PDF::Extractor::Reader.new
        REXML::Parsers::StreamParser.new(source, listener).parse
        @pages = listener.pages
        @fonts = listener.fonts
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,46 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: pdf-struct
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Erik Terpstra
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-02-10 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: PDF::Extractor is a library that provides high level access to the text
15
+ objects of a PDF document.
16
+ email: unknown@email.com
17
+ executables: []
18
+ extensions: []
19
+ extra_rdoc_files: []
20
+ files:
21
+ - lib/pdf/extractor.rb
22
+ homepage: http://rubygems.org/gems/pdf-struct
23
+ licenses: []
24
+ post_install_message:
25
+ rdoc_options: []
26
+ require_paths:
27
+ - lib
28
+ required_ruby_version: !ruby/object:Gem::Requirement
29
+ none: false
30
+ requirements:
31
+ - - ! '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ required_rubygems_version: !ruby/object:Gem::Requirement
35
+ none: false
36
+ requirements:
37
+ - - ! '>='
38
+ - !ruby/object:Gem::Version
39
+ version: '0'
40
+ requirements: []
41
+ rubyforge_project:
42
+ rubygems_version: 1.8.16
43
+ signing_key:
44
+ specification_version: 3
45
+ summary: PDF::Extractor
46
+ test_files: []