pdf-struct 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/pdf/extractor.rb +127 -0
- metadata +46 -0
@@ -0,0 +1,127 @@
|
|
1
|
+
require 'rexml/document'
|
2
|
+
require 'rexml/streamlistener'
|
3
|
+
|
4
|
+
module PDF
  # Namespace for the extractor: error classes plus the +open+ entry point
  # that converts a PDF with the external `pdftohtml` command.
  module Extractor
    # Raised when the `pdftohtml` executable cannot be started.
    class ConversionError < RuntimeError; end
    # Raised when pdftohtml reports that the PDF file is damaged.
    class MalformedPDFError < RuntimeError; end

    # Runs `pdftohtml -enc UTF-8 -xml -stdout` on +path+ and wraps the
    # resulting XML in a PDF::Extractor::Document.
    #
    # The command is started from an argument vector rather than an
    # interpolated shell string, so spaces and shell metacharacters in
    # +path+ reach pdftohtml verbatim and are never interpreted by a shell.
    #
    # path - name of the PDF file; converted with +to_s+.
    #
    # Returns a PDF::Extractor::Document built from the command output.
    # Raises ConversionError when pdftohtml cannot be executed,
    # MalformedPDFError when the output contains "PDF file is damaged" and
    # RuntimeError when it contains "Couldn't open file".
    def self.open(path)
      command = ['pdftohtml', '-enc', 'UTF-8', '-xml', '-stdout', path.to_s]

      input = begin
        # :err => [:child, :out] folds stderr into the captured output, which
        # is what the former `2>&1` did, so the diagnostics below still match.
        IO.popen(command, :err => [:child, :out]) { |io| io.read }
      rescue Errno::ENOENT
        raise ConversionError, 'pdftohtml command not found'
      end

      # NOTE(review): diagnostics and document text share one stream, so a PDF
      # whose own text contains one of these phrases would be misreported.
      case input
      when /PDF file is damaged/
        raise MalformedPDFError, "the PDF with filename '#{path}' is malformed"
      when /Couldn't open file/
        raise RuntimeError, "Couldn't open file: '#{path}'"
      else
        PDF::Extractor::Document.new(input)
      end
    end
  end
end
|
24
|
+
|
25
|
+
module PDF
  module Extractor
    # A single text element: its position (top, left), its dimensions
    # (width, height), the font it uses and its textual content.
    # Only +content+ can be reassigned after construction.
    class Element
      attr_reader :top, :left, :width, :height, :font
      attr_accessor :content

      # params - Hash that may carry :top, :left, :width, :height, :font and
      #          :content; keys that are absent leave the attribute nil.
      def initialize(params = {})
        @top, @left, @width, @height = params.values_at(:top, :left, :width, :height)
        @font, @content = params.values_at(:font, :content)
      end
    end
  end
end
|
38
|
+
|
39
|
+
module PDF
  module Extractor
    # A font description: identifier, family name, size and a style that
    # starts out as :normal and can be changed afterwards.
    class Font
      attr_reader :id, :name, :size
      attr_accessor :style

      # params - Hash that may carry :id, :size and :name. The size is
      #          converted with +to_f+, so a missing size becomes 0.0.
      def initialize(params = {})
        @id, @size, @name = params[:id], params[:size].to_f, params[:name]
        @style = :normal
      end

      # True while the style is :normal.
      def normal?
        @style == :normal
      end

      # True when the style has been set to :bold.
      def bold?
        @style == :bold
      end

      # True when the style has been set to :italic.
      def italic?
        @style == :italic
      end
    end
  end
end
|
54
|
+
|
55
|
+
module PDF
  module Extractor
    # A page: its width and height plus the list of elements placed on it.
    class Page
      attr_reader :elements, :width, :height

      # params - Hash that may carry :width and :height. The element list
      #          always starts empty and is filled by the caller.
      def initialize(params = {})
        @width, @height = params.values_at(:width, :height)
        @elements = []
      end
    end
  end
end
|
64
|
+
|
65
|
+
module PDF
  module Extractor
    # REXML stream listener that builds Page, Font and Element objects from
    # <page>, <fontspec> and <text> tags as they are parsed.
    #
    # Differences from a naive listener that matter to callers:
    # * <b>/<i> give the current element its own styled copy of the font
    #   instead of mutating the Font shared by every element that refers to
    #   the same fontspec.
    # * Character data inside one <text> element is accumulated, so mixed
    #   content such as "plain <i>mixed</i> tail" is kept in full.
    class Reader
      include REXML::StreamListener

      attr_reader :pages, :fonts

      def initialize
        @pages, @fonts = [], []
        @in_text = false
        # Styled font copies, keyed by [font id, style], so that elements
        # with the same font and style still share one Font object.
        @styled_fonts = {}
      end

      # Called by the parser for every opening tag.
      #
      # name       - tag name.
      # attributes - Hash of attribute names to String values.
      def tag_start(name, attributes)
        case name
        when 'page'
          @pages << PDF::Extractor::Page.new(
            :width  => attributes['width'].to_f,
            :height => attributes['height'].to_f
          )
        when 'fontspec'
          @fonts << PDF::Extractor::Font.new(
            :id   => attributes['id'],
            :size => attributes['size'].to_f + 2, # is this right?
            :name => attributes['family']
          )
        when 'text'
          @in_text = true
          @pages.last.elements << PDF::Extractor::Element.new(
            :top    => attributes['top'].to_f,
            :left   => attributes['left'].to_f,
            :width  => attributes['width'].to_f,
            :height => attributes['height'].to_f,
            :font   => @fonts.find { |font| font.id == attributes['font'] }
          )
        when 'b'
          restyle_current_element(:bold)
        when 'i'
          restyle_current_element(:italic)
        end
      end

      # Called by the parser for every closing tag. Leaving <text> stops
      # text collection so whitespace between elements is not appended.
      def tag_end(name)
        @in_text = false if name == 'text'
      end

      # Called by the parser for character data. Appends +str+ to the
      # current element while inside <text>. A chunk without any
      # non-whitespace character is skipped as long as the element has no
      # content yet, so whitespace-only elements keep a nil content.
      def text(str)
        return unless @in_text
        element = current_element
        return unless element
        return if element.content.nil? && str !~ /\S/
        element.content = element.content.to_s + str
      end

      private

      # The element most recently added to the last page, or nil.
      def current_element
        page = @pages.last
        page && page.elements.last
      end

      # Swaps the current element for a copy that carries a +style+ variant
      # of its font. Does nothing outside <text> or when the element has no
      # font (unknown font id).
      def restyle_current_element(style)
        return unless @in_text
        element = current_element
        return unless element && element.font

        @pages.last.elements[-1] = PDF::Extractor::Element.new(
          :top     => element.top,
          :left    => element.left,
          :width   => element.width,
          :height  => element.height,
          :font    => styled_font(element.font, style),
          :content => element.content
        )
      end

      # Returns the cached copy of +font+ with the given +style+, creating
      # it on first use. The Font stored in +fonts+ is never modified.
      def styled_font(font, style)
        @styled_fonts[[font.id, style]] ||= begin
          copy = font.dup
          copy.style = style
          copy
        end
      end
    end
  end
end
|
110
|
+
|
111
|
+
module PDF
  module Extractor
    # A parsed document: the pages collected by a PDF::Extractor::Reader
    # while stream-parsing the given XML source.
    class Document
      attr_reader :pages

      # source - XML to parse; handed unchanged to
      #          REXML::Parsers::StreamParser.
      def initialize(source)
        populate(source)
      end

      # Returns the elements of all pages as one flat Array, in page order.
      def elements
        pages.map(&:elements).flatten
      end

      private

      # Parses +source+ with a fresh Reader and keeps the pages and fonts
      # it collected.
      def populate(source)
        reader = PDF::Extractor::Reader.new
        parser = REXML::Parsers::StreamParser.new(source, reader)
        parser.parse
        @pages, @fonts = reader.pages, reader.fonts
      end
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: pdf-struct
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Erik Terpstra
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-02-10 00:00:00.000000000 Z
|
13
|
+
dependencies: []
|
14
|
+
description: PDF::Extractor is a library that provides high level access to the text
|
15
|
+
objects of a PDF document.
|
16
|
+
email: unknown@email.com
|
17
|
+
executables: []
|
18
|
+
extensions: []
|
19
|
+
extra_rdoc_files: []
|
20
|
+
files:
|
21
|
+
- lib/pdf/extractor.rb
|
22
|
+
homepage: http://rubygems.org/gems/pdf-struct
|
23
|
+
licenses: []
|
24
|
+
post_install_message:
|
25
|
+
rdoc_options: []
|
26
|
+
require_paths:
|
27
|
+
- lib
|
28
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
29
|
+
none: false
|
30
|
+
requirements:
|
31
|
+
- - ! '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
35
|
+
none: false
|
36
|
+
requirements:
|
37
|
+
- - ! '>='
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: '0'
|
40
|
+
requirements: []
|
41
|
+
rubyforge_project:
|
42
|
+
rubygems_version: 1.8.16
|
43
|
+
signing_key:
|
44
|
+
specification_version: 3
|
45
|
+
summary: PDF::Extractor
|
46
|
+
test_files: []
|