pdfbeads 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,160 @@
1
+ # encoding: UTF-8
2
+
3
+ ######################################################################
4
+ #
5
+ # PDFBeads -- convert scanned images to a single PDF file
6
+ # Version 1.0
7
+ #
8
+ # Unlike other PDF creation tools, this utility attempts to implement
9
+ # the approach typically used for DjVu books. Its key feature is
10
+ # separating scanned text (typically black, but indexed images with
11
+ # a small number of colors are also accepted) from halftone images
12
+ # placed into a background layer.
13
+ #
14
+ # Copyright (C) 2010 Alexey Kryukov (amkryukov@gmail.com).
15
+ # All rights reserved.
16
+ #
17
+ # This program is free software; you can redistribute it and/or modify
18
+ # it under the terms of the GNU General Public License as published by
19
+ # the Free Software Foundation; either version 2 of the License, or
20
+ # (at your option) any later version.
21
+ #
22
+ # This program is distributed in the hope that it will be useful,
23
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
24
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25
+ # GNU General Public License for more details.
26
+ #
27
+ # You should have received a copy of the GNU General Public License
28
+ # along with this program; if not, write to the Free Software
29
+ # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
30
+ #
31
+ #######################################################################
32
+
33
+ # Read a table of contents from a UTF-8 text file and prepare it for
34
+ # placing into a PDF document. The syntax of the TOC file is simple.
35
+ # Each line describes a single outline item according to the following
36
+ # pattern:
37
+ #
38
+ # <indent>"Title" "Page Number" [0|-|1|+]
39
+ #
40
+ # The indent is used to determine the level of this outline item: it may
41
+ # consist either of spaces or of tabs, but it is not allowed to
42
+ # mix both characters in the same file. The title and page number are
43
+ # separated with an arbitrary number of whitespace characters and are
44
+ # normally enclosed in double quotes. The third, optional argument
45
+ # specifies if this TOC item should be displayed unfolded by default
46
+ # (i. e. if its descendants should be visible).
47
+ #
48
+ # The reference to a TOC file can be passed to pdfbeads via the *-C*
49
+ # (or *--toc*) option. It is recommended to use this option in combination
50
+ # with the *-L* (or *--labels*) parameter, which allows you to specify an
51
+ # alternate page numbering for a PDF file: thus your TOC file may
52
+ # contain the same page numbers as the original book, so that there is
53
+ # no need to care about any numbering offsets.
54
+
55
# Table of contents (outline) of a PDF document, read from a UTF-8 text file.
#
# The object itself is a flat Array of PDFTOCItem nodes: element 0 is an
# artificial root node (indent -1, always open), followed by one node per
# accepted TOC line in file order. The tree structure is expressed through
# the :parent, :children, :prev and :next keys of each node.
class PDFBeads::PDFBuilder::PDFTOC < Array
  # This class represents a single TOC item with its parent node and
  # next/previous siblings. It is a Hash using the following keys:
  #
  # :title::    item title, converted to UTF-16BE
  # :ref::      page reference, exactly as written in the TOC file
  # :indent::   number of leading indent characters (-1 for the root node)
  # :open::     true if the item should be displayed unfolded
  # :children:: Array of direct child items
  # :parent::   parent item (absent for the root node)
  # :prev::     previous item when it has the same indent (may be absent)
  # :next::     following item when it has the same indent (may be absent)
  class PDFTOCItem < Hash
    # Returns the number of direct children plus, recursively, the counts
    # of every child which is itself open and has children.
    def getChildrenCount()
      self[:children].inject( self[:children].length ) do |cnt, child|
        if child[:open] && !child[:children].empty?
          cnt + child.getChildrenCount
        else
          cnt
        end
      end
    end

    # Walks from this item towards the root and returns the first node
    # whose indent is exactly +indent+, or nil if the walk reaches a node
    # with a smaller indent (or the root) without finding one.
    def prevSibling( indent )
      if has_key?( :parent ) && self[:indent] > indent
        return self[:parent].prevSibling( indent )
      end

      self[:indent] == indent ? self : nil
    end
  end

  # Creates the root node and fills the array from the TOC file at +fpath+.
  # Parse problems are reported on $stderr; in that case the array keeps
  # only the items read before the offending line.
  def initialize( fpath )
    root = PDFTOCItem[
      :indent   => -1,
      :open     => true,
      :children => []
    ]
    push( root )
    parseTOC( fpath, root )
  end

  private

  # Reads the file at +path+ line by line and appends one PDFTOCItem per
  # TOC line, linking it into the tree below +root+. Lines starting with
  # '#' and lines with fewer than two fields are ignored. Parsing stops
  # (after printing a message to $stderr) on the first line which is not
  # valid UTF-8, mixes spaces and tabs in indents, or has an indent that
  # matches none of its ancestors.
  def parseTOC( path,root )
    File.open( path,'r' ) do |fin|
      fin.set_encoding 'UTF-8' if fin.respond_to? :set_encoding
      prev = root
      # The first indent byte seen in the file; all others must match it.
      indent_char = nil

      fin.each do |fl|
        # On Ruby 1.9+ a regexp match against a string with invalid bytes
        # raises ArgumentError, so bad input has to be caught before the
        # first match in order to report it properly.
        if fl.respond_to?( :valid_encoding? ) && !fl.valid_encoding?
          $stderr.puts("Error: TOC should be specified in utf-8")
          return
        end
        next if /^\#/.match( fl )

        parts = fl.scan( /".*?"|\S+/ )
        next unless parts.length > 1

        title = toUTF16BE( stripQuotes( parts[0] ) )
        if title.nil?
          $stderr.puts("Error: TOC should be specified in utf-8")
          return
        end

        entry = PDFTOCItem[
          :title    => title,
          :ref      => stripQuotes( parts[1] ),
          :indent   => 0,
          :children => []
        ]

        lead = /^([ \t]+)/.match( fl )
        if lead
          indent = lead[1]
          indent.each_byte do |byte|
            indent_char ||= byte
            if byte != indent_char
              $stderr.puts("Error: you should not mix spaces and tabs in TOC indents\n")
              return
            end
          end
          entry[:indent] = indent.length
        end

        # On a dedent, climb back to the ancestor with the same indent.
        if entry[:indent] < prev[:indent]
          prev = prev.prevSibling( entry[:indent] )
        end
        if prev.nil?
          $stderr.puts("Error: a TOC item seems to have a wrong indent\n")
          return
        end

        if entry[:indent] == prev[:indent]
          # Same level: share the parent and chain the two siblings.
          entry[:parent] = prev[:parent]
          entry[:parent][:children].push( entry )
          entry[:prev] = prev
          prev[:next] = entry
        elsif entry[:indent] > prev[:indent]
          # Deeper level: the previous item becomes the parent.
          entry[:parent] = prev
          prev[:children].push( entry )
        end

        entry[:open] = parts.length > 2 && ( parts[2] == '+' || parts[2] == '1' )

        push( entry )
        prev = entry
      end
    end
  end

  # Removes one leading and one trailing double quote from +field+, if any.
  def stripQuotes( field )
    field.sub( /\A"/m, "" ).sub( /"\Z/m, "" )
  end

  # Converts +text+ from UTF-8 to UTF-16BE and returns the result, or nil
  # if the conversion fails. String#encode is preferred because Iconv is no
  # longer part of the standard library since Ruby 2.0; Iconv is used only
  # on interpreters whose String has no encode method.
  def toUTF16BE( text )
    if text.respond_to?( :encode )
      text.encode( 'UTF-16BE', 'UTF-8' )
    else
      Iconv.iconv( "utf-16be", "utf-8", text ).first
    end
  rescue StandardError
    nil
  end
end
metadata ADDED
@@ -0,0 +1,82 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: pdfbeads
3
+ version: !ruby/object:Gem::Version
4
+ hash: 23
5
+ prerelease: false
6
+ segments:
7
+ - 1
8
+ - 0
9
+ - 0
10
+ version: 1.0.0
11
+ platform: ruby
12
+ authors:
13
+ - Alexey Kryukov
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-11-07 00:00:00 +03:00
19
+ default_executable: pdfbeads
20
+ dependencies: []
21
+
22
+ description: " PDFBeads is a small utility written in Ruby which takes scanned\n page images and converts them into a single PDF file. Unlike other\n PDF creation tools, PDFBeads attempts to implement the approach\n typically used for DjVu books. Its key feature is separating scanned\n text (typically black, but indexed images with a small number of\n colors are also accepted) from halftone pictures. Each type of\n graphical data is encoded into its own layer with a specific\n compression method and resolution.\n"
23
+ email: amkryukov@gmail.com
24
+ executables:
25
+ - pdfbeads
26
+ extensions: []
27
+
28
+ extra_rdoc_files:
29
+ - README
30
+ - COPYING
31
+ - ChangeLog
32
+ files:
33
+ - lib/pdfbeads/pdfbuilder.rb
34
+ - lib/pdfbeads/pdfpage.rb
35
+ - lib/pdfbeads/pdftoc.rb
36
+ - lib/pdfbeads/pdffont.rb
37
+ - lib/pdfbeads/pdflabels.rb
38
+ - lib/pdfbeads/pdfdoc.rb
39
+ - lib/pdfbeads.rb
40
+ - lib/imageinspector.rb
41
+ - bin/pdfbeads
42
+ - doc/pdfbeads.ru.html
43
+ - README
44
+ - COPYING
45
+ - ChangeLog
46
+ has_rdoc: true
47
+ homepage: http://pdfbeads.rubyforge.org
48
+ licenses: []
49
+
50
+ post_install_message:
51
+ rdoc_options: []
52
+
53
+ require_paths:
54
+ - lib
55
+ required_ruby_version: !ruby/object:Gem::Requirement
56
+ none: false
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ hash: 3
61
+ segments:
62
+ - 0
63
+ version: "0"
64
+ required_rubygems_version: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ hash: 3
70
+ segments:
71
+ - 0
72
+ version: "0"
73
+ requirements:
74
+ - RMagick, v2.13.0 or greater
75
+ - Hpricot, v0.8.3 or greater
76
+ rubyforge_project: PDFBeads
77
+ rubygems_version: 1.3.7
78
+ signing_key:
79
+ specification_version: 3
80
+ summary: PDFBeads -- convert scanned images to a single PDF file.
81
+ test_files: []
82
+