pdf_reader 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Diederick Lawson - Finalist IT Group
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,9 @@
1
+ Manifest
2
+ Rakefile
3
+ README.rdoc
4
+ MIT-LICENSE
5
+ pdf_reader.gemspec
6
+ lib/pdf_reader.rb
7
+ spec/pdf_reader_spec.rb
8
+ spec/test.html
9
+ spec/test.pdf
@@ -0,0 +1,21 @@
1
+ = PDF Reader plugin
2
+
3
+ Gem that gives you the "power" to extract raw text from a PDF.
4
+
5
+ == Installation
6
+
7
+ Type:
8
+ sudo gem install finalist-pdf_reader
9
+
10
+ And... done!
11
+
12
+ == Usage
13
+
14
+ Create a new object and call +raw_text+:
15
+ p = PDFReader.new('test.pdf')
16
+ puts p.raw_text
17
+
18
+ ... Done!
19
+
20
+ == Credits
21
+ Copyright 2009, Diederick Lawson - Finalist IT Group. Released under the MIT License.
@@ -0,0 +1,23 @@
1
+ require 'rake'
2
+ require 'spec/rake/spectask'
3
+ require 'echoe'
4
+
5
# Packaging metadata handed to Echoe for the pdf_reader gem.
Echoe.new('pdf_reader', '0.0.1') do |gem|
  gem.description = "PDF reader"
  gem.url = "http://github.com/finalist/pdf_reader"
  gem.author = "Diederick Lawson"
  gem.email = "diederick@finalist.com"
  gem.ignore_pattern = ["tmp/*", "script/*"]
  gem.development_dependencies = []
end

# Load every extra rake file found under tasks/ next to this Rakefile,
# in sorted (name) order.
extra_task_files = Dir["#{File.dirname(__FILE__)}/tasks/*.rake"].sort
extra_task_files.each do |task_file|
  load task_file
end

desc 'Default: run specs.'
task :default => :spec

desc 'Run the specs'
Spec::Rake::SpecTask.new(:spec) do |spec_task|
  spec_task.spec_files = FileList['spec/**/*_spec.rb']
  spec_task.spec_opts = ['--colour --format progress --loadby mtime --reverse']
end
@@ -0,0 +1,141 @@
1
+ require 'zlib'
2
+
3
# Extracts the raw text of a PDF file with a deliberately simple scanner:
# the file is cut into "obj ... endobj" chunks, Flate-compressed streams are
# inflated, and everything found between parentheses in a stream is treated
# as text.
#
#   reader = PDFReader.new('test.pdf')
#   puts reader.raw_text
class PDFReader
  # file - path of the PDF file to read.
  def initialize(file)
    @file = file
  end

  # Extracts all raw text.
  #
  # Returns a String with the text of every stream whose header looks textual
  # (see is_text_header?), concatenated in file order. Returns an empty
  # string when nothing is found. Errors raised while opening or reading the
  # file, or while inflating a stream, propagate to the caller.
  def raw_text
    objects = decode_objects(get_objects(get_data(@file)))

    text = ''.dup
    objects.each do |o|
      text << o[:data] if o[:data] && is_text_header?(o[:header])
    end

    text
  end

  private

  # Decodes all given chunks to readable text.
  #
  # Each chunk hash is updated in place: compressed stream data is inflated
  # first, then reduced to its text. Chunks without stream data are left
  # untouched. Returns the chunks with nil entries removed.
  def decode_objects(objects)
    objects.compact.each do |o|
      next unless o[:data]

      inflate_object(o) if is_object_deflated?(o)
      o[:data] = postscript_to_text(o[:data])
    end
  end

  # Checks whether the PDF header is a textual header or not: it must not
  # mention '/Subtype' nor a '/Length1'..'/Length3' style key.
  def is_text_header?(header)
    !header.include?('/Subtype') && header !~ %r{/Length[1-3]+}
  end

  # Parses postscript; currently this simply collects the string operands.
  def postscript_to_text(data)
    get_text(data)
  end

  # Extracts text from a chunk: everything between ()'s is text.
  # Escaped or nested parentheses are not interpreted.
  def get_text(chunk)
    get_block_data(chunk, /\(/, /\)/, 1).map { |part| part[1..-2] }.join
  end

  # Inflates the (Flate-compressed) data of a chunk in place and returns the
  # chunk. The zlib stream is closed even when inflating raises.
  def inflate_object(object)
    z = Zlib::Inflate.new
    begin
      object[:data] = z.inflate(object[:data])
    ensure
      z.close
    end

    object
  end

  # Tests if a chunk is compressed.
  def is_object_deflated?(object)
    object[:header].include?('FlateDecode')
  end

  # Retrieves raw chunks from the given string; objects without a
  # '<< ... >>' header are dropped.
  def get_objects(data)
    get_block_data(data, /obj/, /endobj/, 6).map { |o| get_chunk(o) }.compact
  end

  # Retrieves a chunk from an extracted object.
  #
  # Returns a hash with the first '<< ... >>' block as :header (the scan stops
  # at the first '>>', so a nested dictionary is cut short) and the stream
  # contents, if any, as :data. Returns nil when the object has no header.
  def get_chunk(object)
    header = get_block_data(object, /<</, />>/, 2).first
    return nil if header.nil?

    { :header => header, :data => get_stream_data(object) }
  end

  # Retrieves the data between the "stream" and "endstream" words.
  #
  # The length of the opening delimiter is taken from the text that actually
  # matched, so both "stream\n" and "stream\r\n" are skipped completely, and
  # one end-of-line sequence in front of "endstream" is removed only when it
  # is really there. Returns nil when the object has no stream.
  def get_stream_data(object)
    start_word = /stream(\n|\r\n)/
    end_word = /endstream/
    end_word_length = 9

    block = get_block_data(object, start_word, end_word, end_word_length).first
    return nil if block.nil?

    # the block starts exactly at the start_word match
    start_word_length = block[start_word].length
    payload = block.slice(start_word_length, block.length - start_word_length - end_word_length)

    payload.sub(/(\r\n|\n|\r)\z/, '')
  end

  # Returns whole file contents, read in binary mode. The block form closes
  # the file even when reading fails.
  def get_data(file)
    File.open(file, 'rb') { |f| f.read }
  end

  # Retrieves blocks from the given string.
  #
  # data            - String to scan.
  # start_word      - Regexp marking the start of a block.
  # end_word        - Regexp marking the end of a block.
  # end_word_length - number of characters of the end marker to include.
  #
  # Returns an Array with every block, markers included, in order of
  # appearance. A start marker without a following end marker ends the scan.
  def get_block_data(data, start_word, end_word, end_word_length)
    blocks = []
    position = 0

    while (start_index = data.index(start_word, position))
      end_index = data.index(end_word, start_index)
      break if end_index.nil?

      blocks.push(data.slice(start_index, end_index - start_index + end_word_length))

      # continue after the end marker, so a start marker that also occurs
      # inside it ("obj" in "endobj") is not picked up again; always move
      # forward by at least one character so the scan terminates
      position = end_index + [end_word_length, 1].max
    end

    blocks
  end
end
@@ -0,0 +1,30 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
# Gem specification for pdf_reader 0.0.1.
#
# Setters that are not present in every RubyGems release are guarded with
# respond_to? so the file loads regardless of which of them exist. The
# former version check was removed: both of its branches were empty, and it
# referenced the legacy Gem::RubyGemsVersion constant
# (NOTE(review): newer RubyGems appears not to define it - confirm).
Gem::Specification.new do |s|
  s.name = %q{pdf_reader}
  s.version = "0.0.1"

  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
  s.authors = ["Diederick Lawson"]
  s.date = %q{2009-07-06}
  s.description = %q{PDF Reader}
  s.email = %q{diederick@finalist.com}
  s.extra_rdoc_files = ["lib/pdf_reader.rb", "README.rdoc"]
  s.files = ['Manifest', 'Rakefile', 'README.rdoc', 'MIT-LICENSE', 'pdf_reader.gemspec', 'lib/pdf_reader.rb', 'spec/pdf_reader_spec.rb', 'spec/test.html', 'spec/test.pdf']
  s.homepage = %q{http://github.com/finalist/pdf_reader}
  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Pdf_reader", "--main", "README.rdoc"]
  s.require_paths = ["lib"]
  s.rubyforge_project = %q{pdf_reader} if s.respond_to? :rubyforge_project=
  s.rubygems_version = %q{1.3.4}
  s.summary = %q{PDF Reader}

  s.specification_version = 3 if s.respond_to? :specification_version=
end
@@ -0,0 +1,10 @@
1
+ require 'rubygems'
2
+ require 'spec'
3
+ Dir.glob(File.join(File.dirname(__FILE__), '..', 'lib', '*.rb')).each { |f| require f }
4
+
5
# Describes the class under test; the fixture is resolved relative to this
# spec file so the example does not depend on the directory it is run from.
describe PDFReader do
  it "should extract raw text from pdf" do
    reader = PDFReader.new(File.join(File.dirname(__FILE__), 'test.pdf'))
    reader.raw_text.should eql('Test PDF, try and read me!')
  end
end
@@ -0,0 +1 @@
1
+ Test PDF, try and read me!
Binary file
metadata ADDED
@@ -0,0 +1,82 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: pdf_reader
3
+ version: !ruby/object:Gem::Version
4
+ hash: 29
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 0
9
+ - 1
10
+ version: 0.0.1
11
+ platform: ruby
12
+ authors:
13
+ - Diederick Lawson
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2009-07-06 00:00:00 +02:00
19
+ default_executable:
20
+ dependencies: []
21
+
22
+ description: PDF Reader
23
+ email: diederick@finalist.com
24
+ executables: []
25
+
26
+ extensions: []
27
+
28
+ extra_rdoc_files:
29
+ - lib/pdf_reader.rb
30
+ - README.rdoc
31
+ files:
32
+ - Manifest
33
+ - Rakefile
34
+ - README.rdoc
35
+ - MIT-LICENSE
36
+ - pdf_reader.gemspec
37
+ - lib/pdf_reader.rb
38
+ - spec/pdf_reader_spec.rb
39
+ - spec/test.html
40
+ - spec/test.pdf
41
+ has_rdoc: true
42
+ homepage: http://github.com/finalist/pdf_reader
43
+ licenses: []
44
+
45
+ post_install_message:
46
+ rdoc_options:
47
+ - --line-numbers
48
+ - --inline-source
49
+ - --title
50
+ - Pdf_reader
51
+ - --main
52
+ - README.rdoc
53
+ require_paths:
54
+ - lib
55
+ required_ruby_version: !ruby/object:Gem::Requirement
56
+ none: false
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ hash: 3
61
+ segments:
62
+ - 0
63
+ version: "0"
64
+ required_rubygems_version: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ hash: 11
70
+ segments:
71
+ - 1
72
+ - 2
73
+ version: "1.2"
74
+ requirements: []
75
+
76
+ rubyforge_project: pdf_reader
77
+ rubygems_version: 1.6.2
78
+ signing_key:
79
+ specification_version: 3
80
+ summary: PDF Reader
81
+ test_files: []
82
+