pdf-struct 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/pdf/extractor.rb +127 -0
- metadata +46 -0
@@ -0,0 +1,127 @@
|
|
1
|
+
require 'rexml/document'
|
2
|
+
require 'rexml/streamlistener'
|
3
|
+
|
4
|
+
module PDF
  # Entry point for turning a PDF file into a structured document by running
  # the external `pdftohtml` tool and parsing the XML it prints.
  module Extractor
    # Raised when the `pdftohtml` executable cannot be started.
    class ConversionError < RuntimeError; end

    # Raised when `pdftohtml` reports that the PDF file is damaged.
    class MalformedPDFError < RuntimeError; end

    # Converts the PDF at +path+ to XML with `pdftohtml` and wraps the result
    # in a PDF::Extractor::Document.
    #
    # The tool is started with an argument vector instead of an interpolated
    # shell string, so spaces and shell metacharacters in +path+ are passed
    # through verbatim as a single file name and are never interpreted by a
    # shell.
    #
    # @param path [#to_s] location of the PDF file
    # @return [PDF::Extractor::Document] the parsed document
    # @raise [ConversionError] if the `pdftohtml` executable is not found
    # @raise [MalformedPDFError] if the tool reports a damaged PDF
    # @raise [RuntimeError] if the tool reports it could not open the file
    def self.open(path)
      command = ['pdftohtml', '-enc', 'UTF-8', '-xml', '-stdout', path.to_s]

      input = begin
        # stderr is merged into stdout (the equivalent of `2>&1`) because the
        # diagnostics matched below are read from the same stream as the XML.
        IO.popen(command + [{ :err => [:child, :out] }]) { |io| io.read }
      rescue Errno::ENOENT
        raise ConversionError, 'pdftohtml command not found'
      end

      case input
      when /PDF file is damaged/
        raise MalformedPDFError, "the PDF with filename '#{path}' is malformed"
      when /Couldn't open file/
        raise RuntimeError, "Couldn't open file: '#{path}'"
      else
        PDF::Extractor::Document.new(input)
      end
    end
  end
end
|
24
|
+
|
25
|
+
module PDF
  module Extractor
    # Holds the geometry (top, left, width, height), the font and the text
    # content of a single element. Only the content can be changed after
    # construction.
    class Element
      attr_accessor :content
      attr_reader :top, :left, :width, :height, :font

      # @param params [Hash] values keyed by :top, :left, :width, :height,
      #   :font and :content; keys that are absent leave the attribute nil
      def initialize(params = {})
        @top, @left, @width, @height, @font, @content =
          params.values_at(:top, :left, :width, :height, :font, :content)
      end
    end
  end
end
|
38
|
+
|
39
|
+
module PDF
  module Extractor
    # Describes a font by id, name and size, plus a mutable style flag that
    # starts out as :normal.
    class Font
      attr_accessor :style
      attr_reader :id, :name, :size

      # @param params [Hash] values keyed by :id, :size and :name; the size is
      #   coerced with #to_f
      def initialize(params = {})
        @id    = params[:id]
        @size  = params[:size].to_f
        @name  = params[:name]
        @style = :normal
      end

      # @return [Boolean] true while the style is :normal
      def normal?
        @style == :normal
      end

      # @return [Boolean] true when the style has been set to :bold
      def bold?
        @style == :bold
      end

      # @return [Boolean] true when the style has been set to :italic
      def italic?
        @style == :italic
      end
    end
  end
end
|
54
|
+
|
55
|
+
module PDF
  module Extractor
    # A page with a width, a height and an initially empty list of elements
    # that callers append to.
    class Page
      attr_reader :elements, :width, :height

      # @param params [Hash] values keyed by :width and :height
      def initialize(params = {})
        @width, @height = params.values_at(:width, :height)
        @elements = []
      end
    end
  end
end
|
64
|
+
|
65
|
+
module PDF
  module Extractor
    # REXML stream listener that collects pages, fonts and text elements from
    # the tags it is fed ('page', 'fontspec', 'text', 'b' and 'i').
    class Reader
      include REXML::StreamListener

      attr_reader :pages, :fonts

      def initialize
        @pages, @fonts = [], []
        @in_text = false
      end

      # Handles an opening tag.
      #
      # @param name [String] tag name
      # @param attributes [Hash] attribute values keyed by attribute name
      def tag_start(name, attributes)
        # Any tag other than 'text', 'b' or 'i' stops text collection.
        @in_text = false
        case name
        when 'page'
          @pages << PDF::Extractor::Page.new(
            :width  => attributes['width'].to_f,
            :height => attributes['height'].to_f
          )
        when 'fontspec'
          @fonts << PDF::Extractor::Font.new(
            :id   => attributes['id'],
            # NOTE(review): the original author flagged this +2 offset with
            # "is this right?" -- kept as-is, needs confirmation.
            :size => attributes['size'].to_f + 2,
            :name => attributes['family']
          )
        when 'text'
          @in_text = true
          @pages.last.elements << PDF::Extractor::Element.new(
            :top    => attributes['top'].to_f,
            :left   => attributes['left'].to_f,
            :width  => attributes['width'].to_f,
            :height => attributes['height'].to_f,
            :font   => @fonts.find { |font| font.id == attributes['font'] }
          )
        when 'b'
          @in_text = true
          restyle_current_font(:bold)
        when 'i'
          @in_text = true
          restyle_current_font(:italic)
        end
      end

      # Appends a non-blank chunk of character data to the current element.
      # Chunks are accumulated rather than overwritten so that mixed content
      # such as "foo <b>bar</b>" yields "foo bar" instead of only "bar".
      #
      # @param str [String] character data reported by the parser
      def text(str)
        return unless @in_text && str =~ /\S/

        element = current_element
        return unless element

        element.content = element.content.to_s + str
      end

      private

      # Returns the most recently added element of the last page, or nil when
      # no page or no element has been seen yet.
      def current_element
        page = @pages.last
        page && page.elements.last
      end

      # Sets the style on the current element's font. Does nothing when there
      # is no current element or its font id matched no declared fontspec.
      def restyle_current_font(style)
        element = current_element
        font = element && element.font
        font.style = style if font
      end
    end
  end
end
|
110
|
+
|
111
|
+
module PDF
  module Extractor
    # A parsed document: the pages and fonts collected by a
    # PDF::Extractor::Reader while stream-parsing the given XML source.
    class Document
      # +fonts+ is exposed alongside +pages+ because it is collected by the
      # same parse and was previously stored without any way to read it.
      attr_reader :pages, :fonts

      # @param source [String, IO] XML handed to REXML's stream parser
      def initialize(source)
        populate source
      end

      # @return [Array] the elements of every page, in page order
      def elements
        @pages.flat_map(&:elements)
      end

      private

      # Stream-parses +source+ with a fresh Reader and keeps what it collected.
      def populate(source)
        listener = PDF::Extractor::Reader.new
        REXML::Parsers::StreamParser.new(source, listener).parse
        @pages, @fonts = listener.pages, listener.fonts
      end
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: pdf-struct
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Erik Terpstra
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-02-10 00:00:00.000000000 Z
|
13
|
+
dependencies: []
|
14
|
+
description: PDF::Extractor is a library that provides high level access to the text
|
15
|
+
objects of a PDF document.
|
16
|
+
email: unknown@email.com
|
17
|
+
executables: []
|
18
|
+
extensions: []
|
19
|
+
extra_rdoc_files: []
|
20
|
+
files:
|
21
|
+
- lib/pdf/extractor.rb
|
22
|
+
homepage: http://rubygems.org/gems/pdf-struct
|
23
|
+
licenses: []
|
24
|
+
post_install_message:
|
25
|
+
rdoc_options: []
|
26
|
+
require_paths:
|
27
|
+
- lib
|
28
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
29
|
+
none: false
|
30
|
+
requirements:
|
31
|
+
- - ! '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
35
|
+
none: false
|
36
|
+
requirements:
|
37
|
+
- - ! '>='
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: '0'
|
40
|
+
requirements: []
|
41
|
+
rubyforge_project:
|
42
|
+
rubygems_version: 1.8.16
|
43
|
+
signing_key:
|
44
|
+
specification_version: 3
|
45
|
+
summary: PDF::Extractor
|
46
|
+
test_files: []
|