pdf_reader 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Diederick Lawson - Finalist IT Group
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,9 @@
1
+ Manifest
2
+ Rakefile
3
+ README.rdoc
4
+ MIT-LICENSE
5
+ pdf_reader.gemspec
6
+ lib/pdf_reader.rb
7
+ spec/pdf_reader_spec.rb
8
+ spec/test.html
9
+ spec/test.pdf
@@ -0,0 +1,21 @@
1
+ = PDF Reader plugin
2
+
3
+ Gem that gives you the "power" to extract raw text from a PDF.
4
+
5
+ == Installation
6
+
7
+ Type:
8
+ sudo gem install finalist-pdf_reader
9
+
10
+ And... done!
11
+
12
+ == Usage
13
+
14
+ Create a new object and call +raw_text+:
15
+ p = PDFReader.new('test.pdf')
16
+ puts p.raw_text
17
+
18
+ ... Done!
19
+
20
+ == Credits
21
+ Copyright 2009, Diederick Lawson - Finalist IT Group. Released under the MIT License.
@@ -0,0 +1,23 @@
1
+ require 'rake'
2
+ require 'spec/rake/spectask'
3
+ require 'echoe'
4
+
5
# Echoe configuration for the pdf_reader gem.
Echoe.new('pdf_reader', '0.0.1') do |gem_config|
  gem_config.description = "PDF reader"
  gem_config.url = "http://github.com/finalist/pdf_reader"
  gem_config.author = "Diederick Lawson"
  gem_config.email = "diederick@finalist.com"
  gem_config.ignore_pattern = ["tmp/*", "script/*"]
  gem_config.development_dependencies = []
end

# Load every additional rake file found in tasks/, in sorted order.
extra_task_files = Dir["#{File.dirname(__FILE__)}/tasks/*.rake"].sort
extra_task_files.each do |task_file|
  load task_file
end

desc 'Default: run specs.'
task :default => :spec

desc 'Run the specs'
Spec::Rake::SpecTask.new(:spec) do |spec_task|
  # All runner options are passed as one pre-joined string.
  spec_task.spec_opts = ['--colour --format progress --loadby mtime --reverse']
  spec_task.spec_files = FileList['spec/**/*_spec.rb']
end
@@ -0,0 +1,141 @@
1
+ require 'zlib'
2
+
3
# Very small PDF text extractor.
#
# The file is split into "obj ... endobj" blocks, the stream of every block is
# inflated when its dictionary mentions FlateDecode, and everything found
# between parentheses in the stream is collected as text.
class PDFReader
  # file - path of the PDF file to read.
  def initialize(file)
    @file = file
  end

  # Extracts all raw text.
  #
  # Returns a String with the text of every stream whose dictionary passes
  # is_text_header?, concatenated in file order. Errors raised while reading
  # or inflating (e.g. Errno::ENOENT, Zlib::Error) are not rescued.
  def raw_text
    objects = decode_objects(get_objects(get_data(@file)))
    text_objects = objects.select { |o| o[:data] && is_text_header?(o[:header]) }

    text_objects.map { |o| o[:data] }.join
  end

  private

  # Decodes all given chunks to readable text.
  #
  # Each object hash is updated in place: its :data is inflated when needed
  # and then replaced by the text extracted from it. Objects without :data
  # are left untouched.
  def decode_objects(objects)
    objects.each do |o|
      next unless o[:data]

      inflate_object(o) if is_object_deflated?(o)
      o[:data] = postscript_to_text(o[:data])
    end

    objects.compact
  end

  # Checks whether the PDF header is a textual header or not: it must neither
  # contain '/Subtype' nor a '/Length' key directly followed by digits 1-3.
  def is_text_header?(header)
    !header.include?('/Subtype') && header !~ /\/Length[1-3]+/
  end

  # Parses the content stream. Currently this simply collects every string
  # literal; BT/ET text blocks are not interpreted.
  def postscript_to_text(data)
    get_text(data)
  end

  # Extracts text from a chunk: everything between ()'s is text.
  def get_text(chunk)
    text_parts = get_block_data(chunk, /\(/, /\)/, 1)

    # drop the surrounding parentheses of every part
    text_parts.map { |part| part[1..-2] }.join
  end

  # Inflates the :data of a compressed chunk in place and returns the chunk.
  def inflate_object(object)
    z = Zlib::Inflate.new
    begin
      # instance-level inflate (no finish) keeps tolerating streams whose
      # trailing checksum bytes are incomplete
      object[:data] = z.inflate(object[:data])
    ensure
      # release the zlib state even when the data turns out to be corrupt
      z.close
    end

    object
  end

  # Tests if a chunk is compressed.
  def is_object_deflated?(object)
    object[:header].include?('FlateDecode')
  end

  # Retrieves raw chunks from given string; objects without a dictionary are
  # skipped.
  def get_objects(data)
    get_block_data(data, /obj/, /endobj/, 6).map { |o| get_chunk(o) }.compact
  end

  # Retrieves a chunk from an extracted object.
  #
  # Returns { :header => first "<< ... >>" block, :data => stream data or nil }
  # or nil when the object has no "<< ... >>" block at all.
  def get_chunk(object)
    chunk_headers = get_block_data(object, /<</, />>/, 2)
    return nil if chunk_headers.empty?

    { :header => chunk_headers[0], :data => get_stream_data(object) }
  end

  # Retrieves data between the "stream" and "endstream" words.
  #
  # Returns the stream payload without the line break that follows "stream"
  # and without the line break (if any) that precedes "endstream", or nil when
  # the object has no stream.
  def get_stream_data(object)
    end_word_length = 9
    block = get_block_data(object, /stream(\n|\r\n)/, /endstream/, end_word_length).first
    return nil if block.nil?

    # the block always starts with "stream" followed by LF or CR LF
    start_word_length = block[0, 8] == "stream\r\n" ? 8 : 7
    payload = block[start_word_length...-end_word_length]

    # only strip a real end-of-line marker; never cut into the payload itself
    payload.sub(/(\r\n|\n|\r)\z/, '')
  end

  # Returns whole file contents, read in binary mode.
  def get_data(file)
    # block form closes the handle even when reading fails
    File.open(file, 'rb') { |f| f.read }
  end

  # Retrieves blocks from given string.
  #
  # data            - String to scan.
  # start_word      - Regexp marking the start of a block.
  # end_word        - Regexp marking the end of a block.
  # end_word_length - number of characters of the end word to include.
  #
  # Returns an Array with every substring running from a start_word match up
  # to and including the next end_word match. Scanning stops at the first
  # start word that has no matching end word.
  def get_block_data(data, start_word, end_word, end_word_length)
    blocks = []
    cursor = 0

    loop do
      start_index = data.index(start_word, cursor)
      break if start_index.nil?

      end_index = data.index(end_word, start_index)
      break if end_index.nil?

      blocks << data.slice(start_index, end_index - start_index + end_word_length)

      # always move past the end word so the scan is guaranteed to terminate,
      # even when both words match at the same position
      cursor = end_index + [end_word_length, 1].max
    end

    blocks
  end
end
@@ -0,0 +1,30 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
# Gem specification for pdf_reader 0.0.1.
Gem::Specification.new do |s|
  s.name = %q{pdf_reader}
  s.version = "0.0.1"

  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
  s.authors = ["Diederick Lawson"]
  s.date = %q{2009-07-06}
  s.description = %q{PDF Reader}
  s.email = %q{diederick@finalist.com}
  s.extra_rdoc_files = ["lib/pdf_reader.rb", "README.rdoc"]
  s.files = ['Manifest', 'Rakefile', 'README.rdoc', 'MIT-LICENSE', 'pdf_reader.gemspec', 'lib/pdf_reader.rb', 'spec/pdf_reader_spec.rb', 'spec/test.html', 'spec/test.pdf']
  s.homepage = %q{http://github.com/finalist/pdf_reader}
  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Pdf_reader", "--main", "README.rdoc"]
  s.require_paths = ["lib"]
  # Guarded like required_rubygems_version= so the spec still loads when the
  # running RubyGems does not offer this setter.
  s.rubyforge_project = %q{pdf_reader} if s.respond_to? :rubyforge_project=
  s.rubygems_version = %q{1.3.4}
  s.summary = %q{PDF Reader}

  # The gem declares no dependencies, so no RubyGems-version specific
  # dependency declarations are needed here.
  s.specification_version = 3 if s.respond_to? :specification_version
end
@@ -0,0 +1,10 @@
1
+ require 'rubygems'
2
+ require 'spec'
3
+ Dir.glob(File.join(File.dirname(__FILE__), '..', 'lib', '*.rb')).each { |f| require f }
4
+
5
# Describes the class under test; the previously referenced PdfReaderSpec
# constant is not defined anywhere.
describe PDFReader do
  it "should extract raw text from pdf" do
    # The fixture ships next to this spec file (spec/test.pdf); resolve it
    # relative to the spec so the example does not depend on the working
    # directory.
    reader = PDFReader.new(File.join(File.dirname(__FILE__), 'test.pdf'))
    reader.raw_text.should eql('Test PDF, try and read me!')
  end
end
@@ -0,0 +1 @@
1
+ Test PDF, try and read me!
Binary file
metadata ADDED
@@ -0,0 +1,82 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: pdf_reader
3
+ version: !ruby/object:Gem::Version
4
+ hash: 29
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 0
9
+ - 1
10
+ version: 0.0.1
11
+ platform: ruby
12
+ authors:
13
+ - Diederick Lawson
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2009-07-06 00:00:00 +02:00
19
+ default_executable:
20
+ dependencies: []
21
+
22
+ description: PDF Reader
23
+ email: diederick@finalist.com
24
+ executables: []
25
+
26
+ extensions: []
27
+
28
+ extra_rdoc_files:
29
+ - lib/pdf_reader.rb
30
+ - README.rdoc
31
+ files:
32
+ - Manifest
33
+ - Rakefile
34
+ - README.rdoc
35
+ - MIT-LICENSE
36
+ - pdf_reader.gemspec
37
+ - lib/pdf_reader.rb
38
+ - spec/pdf_reader_spec.rb
39
+ - spec/test.html
40
+ - spec/test.pdf
41
+ has_rdoc: true
42
+ homepage: http://github.com/finalist/pdf_reader
43
+ licenses: []
44
+
45
+ post_install_message:
46
+ rdoc_options:
47
+ - --line-numbers
48
+ - --inline-source
49
+ - --title
50
+ - Pdf_reader
51
+ - --main
52
+ - README.rdoc
53
+ require_paths:
54
+ - lib
55
+ required_ruby_version: !ruby/object:Gem::Requirement
56
+ none: false
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ hash: 3
61
+ segments:
62
+ - 0
63
+ version: "0"
64
+ required_rubygems_version: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ hash: 11
70
+ segments:
71
+ - 1
72
+ - 2
73
+ version: "1.2"
74
+ requirements: []
75
+
76
+ rubyforge_project: pdf_reader
77
+ rubygems_version: 1.6.2
78
+ signing_key:
79
+ specification_version: 3
80
+ summary: PDF Reader
81
+ test_files: []
82
+