pdf-struct 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (2) hide show
  1. data/lib/pdf/extractor.rb +127 -0
  2. metadata +46 -0
@@ -0,0 +1,127 @@
1
+ require 'rexml/document'
2
+ require 'rexml/streamlistener'
3
+
4
# Top-level namespace of the pdf-struct library.
module PDF
  # Entry point that converts a PDF file into a PDF::Extractor::Document by
  # running the external `pdftohtml` program and parsing its XML output.
  module Extractor
    # Raised when the `pdftohtml` executable cannot be started.
    class ConversionError < RuntimeError; end

    # Raised when pdftohtml reports that the PDF file is damaged.
    class MalformedPDFError < RuntimeError; end

    # Runs `pdftohtml -enc UTF-8 -xml -stdout` on +path+ and wraps the result.
    #
    # The command is executed from an argument vector, without a shell, so a
    # path containing spaces, quotes or shell metacharacters is passed to
    # pdftohtml verbatim instead of being word-split or interpreted.
    #
    # @param path [#to_s] location of the PDF file
    # @return [PDF::Extractor::Document] document built from the XML output
    # @raise [ConversionError] if the pdftohtml executable is not found
    # @raise [MalformedPDFError] if the output contains "PDF file is damaged"
    # @raise [RuntimeError] if the output contains "Couldn't open file"
    def self.open(path)
      command = ['pdftohtml', '-enc', 'UTF-8', '-xml', '-stdout', path.to_s]

      begin
        # stderr is merged into stdout (the equivalent of the shell's 2>&1)
        # because the diagnostics matched below are looked up in the output.
        input = IO.popen(command, :err => [:child, :out]) { |io| io.read }
      rescue Errno::ENOENT
        raise ConversionError, 'pdftohtml command not found'
      end

      case input
      when /PDF file is damaged/
        raise MalformedPDFError, "the PDF with filename '#{path}' is malformed"
      when /Couldn't open file/
        raise RuntimeError, "Couldn't open file: '#{path}'"
      else
        PDF::Extractor::Document.new(input)
      end
    end
  end
end
24
+
25
module PDF
  module Extractor
    # Value holder for one text element: its box (top, left, width, height),
    # the font it refers to, and its textual content.
    #
    # Everything except +content+ is read-only after construction.
    class Element
      attr_reader :top, :left, :width, :height, :font
      attr_accessor :content

      # @param params [Hash] values keyed by :top, :left, :width, :height,
      #   :font and :content; keys that are absent leave the attribute nil
      def initialize(params = {})
        @top, @left, @width, @height, @font, @content =
          params.values_at(:top, :left, :width, :height, :font, :content)
      end
    end
  end
end
38
+
39
module PDF
  module Extractor
    # Describes a font by id, name and size, plus a mutable style flag that
    # starts out as :normal.
    class Font
      attr_reader :id, :name, :size
      attr_accessor :style

      # @param params [Hash] values keyed by :id, :name and :size; the size is
      #   coerced with #to_f, so a missing size becomes 0.0
      def initialize(params = {})
        @id, @name = params.values_at(:id, :name)
        @size = params[:size].to_f
        @style = :normal
      end

      # @return [Boolean] true while the style is :normal
      def normal?
        @style == :normal
      end

      # @return [Boolean] true when the style is :bold
      def bold?
        @style == :bold
      end

      # @return [Boolean] true when the style is :italic
      def italic?
        @style == :italic
      end
    end
  end
end
54
+
55
module PDF
  module Extractor
    # A page with its dimensions and the list of elements collected for it.
    class Page
      attr_reader :elements, :width, :height

      # @param params [Hash] values keyed by :width and :height; the element
      #   list always starts empty and is filled by appending to #elements
      def initialize(params = {})
        @elements = []
        @width, @height = params.values_at(:width, :height)
      end
    end
  end
end
64
+
65
module PDF
  module Extractor
    # REXML stream listener that builds Page, Font and Element objects from
    # the <page>, <fontspec> and <text> tags of an XML stream.
    #
    # After parsing, #pages holds the pages in document order and #fonts the
    # fonts declared by <fontspec> tags.
    class Reader
      include REXML::StreamListener

      attr_reader :pages, :fonts

      def initialize
        @pages, @fonts = [], []
        @in_text = false
      end

      # Handles an opening tag.
      #
      # @param name [String] tag name
      # @param attributes [Hash] attribute values keyed by attribute name
      def tag_start(name, attributes)
        case name
        when 'page'
          @pages << PDF::Extractor::Page.new(
            :width  => attributes['width'].to_f,
            :height => attributes['height'].to_f
          )
        when 'fontspec'
          @fonts << PDF::Extractor::Font.new(
            :id   => attributes['id'],
            # NOTE(review): the +2 offset is inherited from the original
            # code, which itself asked "is this right?" -- unverified.
            :size => attributes['size'].to_f + 2,
            :name => attributes['family']
          )
        when 'text'
          start_text(attributes)
        when 'b'
          apply_style(:bold)
        when 'i'
          apply_style(:italic)
        end
      end

      # Leaves text-collecting mode once the enclosing <text> tag closes, so
      # character data between elements is never attributed to the last one.
      #
      # @param name [String] tag name
      def tag_end(name)
        @in_text = false if name == 'text'
      end

      # Appends character data to the element currently being read.
      #
      # Chunks are concatenated so that text split by inline tags (for
      # example "a <b>b</b> c") is kept in full. Whitespace-only data before
      # the first visible character is skipped, which leaves the content of
      # a blank element nil.
      #
      # @param str [String] character data reported by the parser
      def text(str)
        return unless @in_text

        element = current_element
        return unless element

        if element.content
          element.content = element.content + str
        elsif str =~ /\S/
          element.content = str
        end
      end

      private

      # Creates the Element for a <text> tag and adds it to the current page.
      def start_text(attributes)
        page = @pages.last
        return unless page # a <text> tag outside any <page> has no home

        spec = @fonts.find { |font| font.id == attributes['font'] }
        page.elements << PDF::Extractor::Element.new(
          :top    => attributes['top'].to_f,
          :left   => attributes['left'].to_f,
          :width  => attributes['width'].to_f,
          :height => attributes['height'].to_f,
          # Each element gets its own copy: a later <b>/<i> must not restyle
          # the fontspec entry or other elements that use the same font id.
          :font   => spec && spec.dup
        )
        @in_text = true
      end

      # Marks the font of the element being read with +style+.
      def apply_style(style)
        return unless @in_text

        element = current_element
        font = element && element.font
        font.style = style if font
      end

      # Most recently created element of the most recent page, or nil.
      def current_element
        page = @pages.last
        page && page.elements.last
      end
    end
  end
end
110
+
111
module PDF
  module Extractor
    # Parsed form of an XML source: the pages and fonts that
    # PDF::Extractor::Reader collected from it.
    class Document
      # +fonts+ is exposed alongside +pages+; it was already stored by
      # #populate but previously had no reader.
      attr_reader :pages, :fonts

      # @param source [String, IO] XML handed to REXML's stream parser
      def initialize(source)
        populate source
      end

      # @return [Array] the elements of every page, in page order, as one
      #   flat list
      def elements
        @pages.flat_map { |page| page.elements }
      end

      private

      # Streams +source+ through a Reader and keeps what it collected.
      def populate(source)
        listener = PDF::Extractor::Reader.new
        REXML::Parsers::StreamParser.new(source, listener).parse
        @pages = listener.pages
        @fonts = listener.fonts
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,46 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: pdf-struct
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Erik Terpstra
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-02-10 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: PDF::Extractor is a library that provides high level access to the text
15
+ objects of a PDF document.
16
+ email: unknown@email.com
17
+ executables: []
18
+ extensions: []
19
+ extra_rdoc_files: []
20
+ files:
21
+ - lib/pdf/extractor.rb
22
+ homepage: http://rubygems.org/gems/pdf-struct
23
+ licenses: []
24
+ post_install_message:
25
+ rdoc_options: []
26
+ require_paths:
27
+ - lib
28
+ required_ruby_version: !ruby/object:Gem::Requirement
29
+ none: false
30
+ requirements:
31
+ - - ! '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ required_rubygems_version: !ruby/object:Gem::Requirement
35
+ none: false
36
+ requirements:
37
+ - - ! '>='
38
+ - !ruby/object:Gem::Version
39
+ version: '0'
40
+ requirements: []
41
+ rubyforge_project:
42
+ rubygems_version: 1.8.16
43
+ signing_key:
44
+ specification_version: 3
45
+ summary: PDF::Extractor
46
+ test_files: []