pdfshaver 0.0.1.alpha

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 5a34c73145fff7cd843f8770b4cdb14648b0d8e8
4
+ data.tar.gz: 543a70e710dc09646580d1bb0fff7bce6f41fe5e
5
+ SHA512:
6
+ metadata.gz: 3bca072b82f7710883dec16c6be56fcd696ef074c40b55ca1139f3671985ea7690fad1f2e7cfb0333f4ec480d1b92b2c722e3fa990a1e7a8b19f6b2bbc814ac0
7
+ data.tar.gz: 3ec48a4b055c0b985567f5922194dbd853a489e392f64cf5ee9b641dd3e45ce9641b2028b4b93d6d802c10516b8cfb4e37575e73c29d27d6a831c31f0a551bf1
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
data/Rakefile ADDED
@@ -0,0 +1,10 @@
1
+ require "bundler/gem_tasks"
2
+ require 'rake/extensiontask'
3
+
4
+ Rake::ExtensionTask.new('pdfium_ruby')
5
+
6
+ task :test => :compile do # compile the extension first, then load every *_spec.rb under test/
7
+ Dir.glob(File.join File.dirname(__FILE__), "test", "**", "*_spec.rb").each{ |test| require test }
8
+ end
9
+
10
+ task(default: :test)
data/Readme.md ADDED
@@ -0,0 +1,14 @@
1
+ # PDFShaver
2
+
3
+ Shave pages off of PDFs as images
4
+
5
+ ### N.B. This is a work in progress
6
+
7
+ ### Examples
8
+
9
+ require 'pdfshaver'
10
+ document = PDFShaver::Document.new('./path/to/document.pdf')
11
+ landscape_pages = document.pages.select{ |page| page.aspect > 1 }
12
+ landscape_pages.each{ |page| page.render("./page_#{page.number}.gif") }
13
+
14
+ copyright 2015 Ted Han, Nathan Stitt & DocumentCloud
@@ -0,0 +1,131 @@
1
+ #include "document.h"
2
+
3
+ /********************************************
4
+ * C++ Document definition
5
+ *********************************************/
6
+ Document::Document() {
7
+ // Start with no PDFium handle so isValid() is well defined before load().
8
+ this->fpdf_document = NULL;
9
+ // Not opened yet, and freeable until load() marks the document as in use.
10
+ this->opened = false;
11
+ this->ready_to_be_freed = true;
12
+ }
13
+
14
+ Document::~Document() {
15
+ // Only close the PDFium handle when load() actually produced one;
16
+ // `opened` stays false for documents that were never successfully loaded.
17
+ if (this->opened) { FPDF_CloseDocument(this->fpdf_document); }
18
+ }
19
+
20
+ int Document::load(VALUE path) {
21
+ // Load the document at `path` via PDFium; returns a PDFium status code.
22
+ this->fpdf_document = FPDF_LoadDocument(StringValuePtr(path), NULL);
23
+ // indicate that Ruby is still using this document.
24
+ this->opened = this->isValid();
25
+ this->ready_to_be_freed = false;
26
+ // FPDF_GetLastError() is undefined after a successful call, so only
27
+ // consult it when the load failed.
28
+ return this->opened ? FPDF_ERR_SUCCESS : (int)FPDF_GetLastError();
29
+ }
30
+
31
+ int Document::length() { return FPDF_GetPageCount(this->fpdf_document); } // page count as reported by PDFium
32
+
33
+ bool Document::isValid() { return !!(this->fpdf_document); } // true when the PDFium handle is non-null
34
+
35
+ void Document::flagDocumentAsReadyForRelease() { this->ready_to_be_freed = true; } // only sets the flag; deletes nothing
36
+
37
+
38
+ void Document::notifyPageOpened(Page* page) {
39
+ // Record `page` in the set of pages currently open on this document.
40
+ this->open_pages.insert(page);
41
+ }
42
+ void Document::notifyPageClosed(Page* page) {
43
+ // Forget `page`, then let the document delete itself if nothing keeps it alive.
44
+ this->open_pages.erase(page);
45
+ this->destroyUnlessPagesAreOpen();
46
+ }
47
+ void Document::destroyUnlessPagesAreOpen() {
48
+ // Deletes this object when it was never opened, or when it has been flagged
49
+ // as ready to be freed and no pages remain in `open_pages`.
50
+ if (!(this->opened) || (this->opened && this->ready_to_be_freed && this->open_pages.empty())) {
51
+ // `this` is gone after the next statement; callers must not touch it again.
52
+ delete this;
53
+ }
54
+ }
55
+
56
+ /********************************************
57
+ * Ruby class definition and initialization
58
+ *********************************************/
59
+
60
+ void Define_Document() {
61
+ // Look up the PDFShaver namespace and the `Document` class inside it.
62
+ VALUE rb_PDFShaver = rb_const_get(rb_cObject, rb_intern("PDFShaver"));
63
+ VALUE rb_PDFShaver_Document = rb_const_get(rb_PDFShaver, rb_intern("Document"));
64
+
65
+ rb_define_alloc_func(rb_PDFShaver_Document, *document_allocate); // each new Document wraps a C++ Document
66
+
67
+ rb_define_private_method(rb_PDFShaver_Document, "open_document_with_pdfium",
68
+ CPP_RUBY_METHOD_FUNC(initialize_document_internals), -1); // -1: called as (argc, argv, self)
69
+ };
70
+
71
+ VALUE document_allocate(VALUE rb_PDFShaver_Document) { // allocator registered in Define_Document
72
+ Document* document = new Document();
73
+ return Data_Wrap_Struct(rb_PDFShaver_Document, NULL, destroy_document_when_safe, document); // no mark function; destroy_document_when_safe is the free function
74
+ }
75
+
76
+ // Entry point for PDFShaver::Document's ruby initializer into C++ land
77
+ VALUE initialize_document_internals(int arg_count, VALUE* args, VALUE self) {
78
+ // Look up PDFShaver and PDFShaver::Document. NOTE(review): neither value is used below.
79
+ VALUE rb_PDFShaver = rb_const_get(rb_cObject, rb_intern("PDFShaver"));
80
+ VALUE rb_PDFShaver_Document = rb_const_get(rb_PDFShaver, rb_intern("Document"));
81
+
82
+ // Use Ruby's argument scanner ("11") to pull out a required `path`
83
+ // argument and one optional `options` argument (not read afterwards).
84
+ VALUE path, options;
85
+ int number_of_args = rb_scan_args(arg_count, args, "11", &path, &options);
86
+
87
+ // Attempt to open the document with the C++ object wrapped by `self`.
88
+ // path should at this point be validated & known to exist.
89
+ Document* document;
90
+ Data_Get_Struct(self, Document, document);
91
+ int parse_status = document->load(path);
92
+ // parse_status is unused for now: document_handle_parse_status(parse_status, path) is disabled.
93
+ if (!document->isValid()) { rb_raise(rb_eArgError, "failed to open file (%" PRIsVALUE")", path); }
94
+
95
+ // Store the page count on this instance as @length.
96
+ rb_ivar_set(self, rb_intern("@length"), INT2FIX(document->length()));
97
+ return self;
98
+ }
99
+
100
+ void document_handle_parse_status(int status, VALUE path) {
101
+ // Looks up the PDFShaver error classes; the status dispatch below is commented out, so nothing is raised yet.
102
+ VALUE rb_PDFShaver = rb_const_get(rb_cObject, rb_intern("PDFShaver"));
103
+ VALUE rb_eEncryptionError = rb_const_get(rb_PDFShaver, rb_intern("EncryptionError"));
104
+ VALUE rb_eInvalidFormatError = rb_const_get(rb_PDFShaver, rb_intern("InvalidFormatError"));
105
+ VALUE rb_eMissingHandlerError = rb_const_get(rb_PDFShaver, rb_intern("MissingHandlerError"));
106
+
107
+ //switch (status) {
108
+ // case PDFPARSE_ERROR_SUCCESS:
109
+ // break;
110
+ // case PDFPARSE_ERROR_FILE:
111
+ // rb_raise(rb_eArgError, "unable to open file (%" PRIsVALUE")", path);
112
+ // break;
113
+ // case PDFPARSE_ERROR_FORMAT:
114
+ // rb_raise(rb_eInvalidFormatError, "file (%" PRIsVALUE") is not a valid PDF", path);
115
+ // break;
116
+ // case PDFPARSE_ERROR_PASSWORD:
117
+ // rb_raise(rb_eEncryptionError, "file (%" PRIsVALUE") is encrypted", path);
118
+ // break;
119
+ // case PDFPARSE_ERROR_CERT:
120
+ // rb_raise(rb_eEncryptionError, "file (%" PRIsVALUE") is encrypted", path);
121
+ // break;
122
+ // case PDFPARSE_ERROR_HANDLER:
123
+ // rb_raise(rb_eMissingHandlerError, "could not find handler for media objects in file (%" PRIsVALUE")", path);
124
+ // break;
125
+ //}
126
+ }
127
+
128
+ static void destroy_document_when_safe(Document* document) { // free function passed to Data_Wrap_Struct in document_allocate
129
+ document->flagDocumentAsReadyForRelease();
130
+ document->destroyUnlessPagesAreOpen(); // may delete `document`; see that method for the conditions
131
+ }
@@ -0,0 +1,53 @@
1
+ #ifndef __DOCUMENT_H__
2
+ #define __DOCUMENT_H__
3
+
4
+ // forward declaration since Page/Document classes are interdependent
5
+ class Page;
6
+ #include "pdfium_ruby.h"
7
+ #include "fpdf_ext.h"
8
+ //#include "core/include/fpdfapi/fpdf_parser.h"
9
+ #include "page.h"
10
+ #include <unordered_set>
11
+
12
+ // C++ Class to wrap lifecycle of
13
+ // PDF Documents opened through PDFium.
14
+ class Document {
15
+ public:
16
+ FPDF_DOCUMENT fpdf_document; // PDFium handle assigned by load()
17
+
18
+ // constructor: starts with no document opened
19
+ Document();
20
+
21
+ int load(VALUE path); // opens the PDF at `path` via PDFium; returns a PDFium status code
22
+
23
+ // wrapper for PDFium's FPDF_GetPageCount
24
+ int length();
25
+
26
+ bool isValid(); // true when fpdf_document is non-null
27
+
28
+ // flags the instance as ready to be disposed of; actual deletion
29
+ // still waits until all of its pages have been closed.
30
+ void flagDocumentAsReadyForRelease();
31
+
32
+ // deletes this object once the conditions checked in its definition hold.
33
+ void destroyUnlessPagesAreOpen();
34
+
35
+ void notifyPageClosed(Page* page); // removes `page` from open_pages, then calls destroyUnlessPagesAreOpen()
36
+ void notifyPageOpened(Page* page); // adds `page` to open_pages
37
+
38
+ // destructor: closes the PDFium document if one was opened
39
+ ~Document();
40
+
41
+ private:
42
+ //bool subscribeToPage(Page* page);
43
+ bool opened; // set by load() to whether PDFium returned a document
44
+ bool ready_to_be_freed; // true until load(); set again by flagDocumentAsReadyForRelease()
45
+ std::unordered_set<Page*> open_pages; // pages registered via notifyPageOpened() and not yet closed
46
+ };
47
+
48
+ static void destroy_document_when_safe(Document* document);
49
+
50
+ VALUE initialize_document_internals(int arg_count, VALUE* args, VALUE self);
51
+ VALUE document_allocate(VALUE rb_PDFShaver_Document);
52
+ void document_handle_parse_status(int status, VALUE path);
53
+ #endif // __DOCUMENT_H__
@@ -0,0 +1,68 @@
1
+ require "mkmf"
2
+ require 'rbconfig'
3
+ # List directories to search for PDFium headers and library files to link against
4
+ def append_pdfium_directory_to paths # returns three pdfium subdirectories per path, followed by the original paths
5
+ paths.map do |dir|
6
+ [
7
+ File.join(dir, 'pdfium'),
8
+ File.join(dir, 'pdfium', 'fpdfsdk', 'include'),
9
+ File.join(dir, 'pdfium', 'third_party', 'base', 'numerics')
10
+ ]
11
+ end.flatten + paths
12
+ end
13
+
14
+ LIB_DIRS = append_pdfium_directory_to %w[
15
+ /usr/local/lib/
16
+ /usr/lib/
17
+ ]
18
+ HEADER_DIRS = append_pdfium_directory_to %w[
19
+ /usr/local/include/
20
+ /usr/include/
21
+ ]
22
+
23
+ # Tell ruby we want to search in the specified paths
24
+ dir_config("pdfium", HEADER_DIRS, LIB_DIRS)
25
+
26
+ LIB_FILES= %w[
27
+ javascript
28
+ bigint
29
+ freetype
30
+ fpdfdoc
31
+ fpdftext
32
+ formfiller
33
+ icudata
34
+ icuuc
35
+ icui18n
36
+ v8_libbase
37
+ v8_base
38
+ v8_snapshot
39
+ v8_libplatform
40
+ jsapi
41
+ pdfwindow
42
+ fxedit
43
+ fxcrt
44
+ fxcodec
45
+ fpdfdoc
46
+ fdrm
47
+ fxge
48
+ fpdfapi
49
+ freetype
50
+ pdfium
51
+ pthread
52
+ freeimage
53
+ ]
54
+
55
+ LIB_FILES.each do | lib | # every listed library is mandatory: abort on the first one mkmf cannot find
56
+ have_library(lib) or abort "Couldn't find library lib#{lib} in #{LIB_DIRS.join(', ')}"
57
+ end
58
+
59
+ $CPPFLAGS += " -fPIC -std=c++11" # applied on every platform
60
+ if RUBY_PLATFORM =~ /darwin/
61
+ have_library('objc') # result is not checked, unlike the libraries above
62
+ FRAMEWORKS = %w{AppKit CoreFoundation}
63
+ $LDFLAGS << FRAMEWORKS.map { |f| " -framework #{f}" }.join
64
+ else
65
+ # nothing extra on other platforms; the flags are already set above
66
+ end
67
+
68
+ create_makefile "pdfium_ruby"
@@ -0,0 +1,190 @@
1
+ #include "page.h"
2
+ #include <FreeImage.h>
3
+
4
+ /********************************************
5
+ * C++ Page definition
6
+ *********************************************/
7
+
8
+ Page::Page() { this->opened = false; this->fpdf_page = NULL; this->document = NULL; } // no page loaded yet; handles start null instead of indeterminate
9
+
10
+ bool Page::load(Document* document, int page_index) {
11
+ // Opens page `page_index` of `document`; returns false when PDFium yields no page.
12
+ this->document = document;
13
+ this->page_index = page_index;
14
+ this->fpdf_page = FPDF_LoadPage(document->fpdf_document, page_index);
15
+ this->opened = (this->fpdf_page != NULL);
16
+ if (this->opened) { document->notifyPageOpened(this); } // register only pages that really opened
17
+ return this->opened;
18
+ }
19
+
20
+ double Page::width(){ return FPDF_GetPageWidth(this->fpdf_page); } // page width as reported by PDFium
21
+ double Page::height(){ return FPDF_GetPageHeight(this->fpdf_page); } // page height as reported by PDFium
22
+ double Page::aspect() { return width() / height(); } // width-to-height ratio; no guard against a zero height
23
+
24
+ bool Page::render(char* path, int width, int height) {
25
+ // Renders this page to the image file `path` (format chosen from its extension);
26
+ // returns false on failure. A page without a positive natural size cannot be rendered.
27
+ if (this->width() <= 0 || this->height() <= 0) { return false; }
28
+ // If no height or width is supplied, render at natural dimensions.
29
+ if (!width && !height) { width = (int)(this->width()); height = (int)(this->height()); }
30
+ // When given only a height or a width,
31
+ // infer the other by preserving page aspect ratio.
32
+ if ( width && !height) { height = width / this->aspect(); }
33
+ if (!width && height) { width = height * this->aspect(); }
34
+ // Reject empty or negative sizes; a zero height would divide by zero below.
35
+ if (width <= 0 || height <= 0) { return false; }
36
+ // Create bitmap. width, height, alpha 1=enabled,0=disabled
37
+ bool alpha = false;
38
+ FPDF_BITMAP bitmap = FPDFBitmap_Create(width, height, alpha);
39
+ if (!bitmap) { return false; }
40
+
41
+ // fill all pixels with white for the background color
42
+ FPDFBitmap_FillRect(bitmap, 0, 0, width, height, 0xFFFFFFFF);
43
+
44
+ // Render a page to a bitmap in RGBA format
45
+ // args are: *buffer, page, start_x, start_y, size_x, size_y, rotation, and flags
46
+ // flags are:
47
+ // 0 for normal display, or combination of flags defined below
48
+ // 0x01 Set if annotations are to be rendered
49
+ // 0x02 Set if using text rendering optimized for LCD display
50
+ // 0x04 Set if you don't want to use GDI+
51
+ int start_x = 0;
52
+ int start_y = 0;
53
+ int rotation = 0;
54
+ int flags = FPDF_PRINTING; // A flag defined in PDFium's codebase.
55
+ FPDF_RenderPageBitmap(bitmap, this->fpdf_page, start_x, start_y, width, height, rotation, flags);
56
+
57
+ // The stride holds the width of one row in bytes. It may not be an exact
58
+ // multiple of the pixel width because the data may be packed to always end on a byte boundary
59
+ int stride = FPDFBitmap_GetStride(bitmap);
60
+
61
+ // Safety checks to make sure that the bitmap
62
+ // is properly sized and can be safely manipulated
63
+ bool bitmapIsntValid = (
64
+ (stride < 0) ||
65
+ (width > INT_MAX / height) ||
66
+ (stride > (INT_MAX / 3) / height) // same test as stride * height > INT_MAX / 3, without overflowing
67
+ );
68
+ if (bitmapIsntValid){
69
+ FPDFBitmap_Destroy(bitmap);
70
+ return false;
71
+ }
72
+
73
+ // Read the FPDF bitmap into a FreeImage bitmap.
74
+ unsigned bpp = 32;
75
+ unsigned red_mask = 0xFF0000;
76
+ unsigned green_mask = 0x00FF00;
77
+ unsigned blue_mask = 0x0000FF;
78
+ bool topdown = true;
79
+ FIBITMAP *raw = FreeImage_ConvertFromRawBits(
80
+ (BYTE*)FPDFBitmap_GetBuffer(bitmap), width, height, stride, bpp, red_mask, green_mask, blue_mask, topdown);
81
+
82
+ // at this point we're done with the FPDF bitmap and can destroy it.
83
+ FPDFBitmap_Destroy(bitmap);
84
+
85
+ // Conversion to jpg or gif requires that the bpp be set to 24
86
+ // since we're not exporting using alpha transparency above in FPDFBitmap_Create
87
+ FIBITMAP *image = FreeImage_ConvertTo24Bits(raw);
88
+ FreeImage_Unload(raw);
89
+
90
+ // figure out the desired format from the file extension
91
+ FREE_IMAGE_FORMAT format = FreeImage_GetFIFFromFilename(path);
92
+
93
+ bool success = false;
94
+ if ( FIF_GIF == format ){
95
+ // Gif requires quantization to drop to 8bpp
96
+ FIBITMAP *gif = FreeImage_ColorQuantize(image, FIQ_WUQUANT);
97
+ success = FreeImage_Save(FIF_GIF, gif, path, GIF_DEFAULT);
98
+ FreeImage_Unload(gif);
99
+ } else {
100
+ // All other formats should be just a save call
101
+ success = FreeImage_Save(format, image, path, 0);
102
+ }
103
+
104
+ // unload the image
105
+ FreeImage_Unload(image);
106
+
107
+ return success;
108
+ }
109
+
110
+ Page::~Page() {
111
+ if (this->opened) { // pages that were never loaded have nothing to close or unregister
112
+ FPDF_ClosePage(this->fpdf_page);
113
+ this->document->notifyPageClosed(this); // may let the owning document delete itself
114
+ }
115
+ }
116
+
117
+ /********************************************
118
+ * Ruby class definition and initialization
119
+ *********************************************/
120
+
121
+ void Define_Page() {
122
+ // Look up the PDFShaver namespace and the `Page` class inside it.
123
+ VALUE rb_PDFShaver = rb_const_get(rb_cObject, rb_intern("PDFShaver"));
124
+ VALUE rb_PDFShaver_Page = rb_const_get(rb_PDFShaver, rb_intern("Page"));
125
+
126
+ rb_define_alloc_func(rb_PDFShaver_Page, *page_allocate); // each new Page wraps a C++ Page
127
+
128
+ rb_define_method(rb_PDFShaver_Page, "render", CPP_RUBY_METHOD_FUNC(page_render), -1); // public; -1: called as (argc, argv, self)
129
+ rb_define_private_method(rb_PDFShaver_Page, "initialize_page_internals",
130
+ CPP_RUBY_METHOD_FUNC(initialize_page_internals),-1);
131
+ }
132
+
133
+ VALUE page_allocate(VALUE rb_PDFShaver_Page) { // allocator registered in Define_Page
134
+ Page* page = new Page();
135
+ return Data_Wrap_Struct(rb_PDFShaver_Page, NULL, destroy_page, page); // no mark function; destroy_page is the free function
136
+ }
137
+
138
+ //bool page_render(int arg_count, VALUE* args, VALUE self) {
139
+ VALUE page_render(int arg_count, VALUE* args, VALUE self) {
140
+ VALUE path, options;
141
+ int width = 0, height = 0;
142
+ // Ruby-facing render(path, width:, height:); returns true or false from Page::render.
143
+ rb_scan_args(arg_count, args, "1:", &path, &options);
144
+ if (!NIL_P(options)) { // nil whenever no keyword hash was passed, whatever arg_count says
145
+ VALUE rb_width = rb_hash_aref(options, ID2SYM(rb_intern("width")));
146
+ VALUE rb_height = rb_hash_aref(options, ID2SYM(rb_intern("height")));
147
+
148
+ if (!(NIL_P(rb_width))) {
149
+ if (FIXNUM_P(rb_width)) { width = FIX2INT(rb_width); }
150
+ else { rb_raise(rb_eArgError, ":width must be a integer"); }
151
+ }
152
+ if (!(NIL_P(rb_height))) {
153
+ if (FIXNUM_P(rb_height)) { height = FIX2INT(rb_height); }
154
+ else { rb_raise(rb_eArgError, ":height must be a integer"); }
155
+ }
156
+ }
157
+ // StringValueCStr guarantees a NUL-terminated path for the C APIs below.
158
+ FREE_IMAGE_FORMAT format = FreeImage_GetFIFFromFilename(StringValueCStr(path));
159
+ if (format == FIF_UNKNOWN) { rb_raise(rb_eArgError, "can't save to unrecognized image format"); }
160
+
161
+ Page* page;
162
+ Data_Get_Struct(self, Page, page);
163
+ return (page->render(StringValueCStr(path), width, height) ? Qtrue : Qfalse);
164
+ }
165
+
166
+ VALUE initialize_page_internals(int arg_count, VALUE* args, VALUE self) {
167
+ // Scan two required arguments (document, page index) and an optional third.
168
+ VALUE rb_document, page_index, options;
169
+ rb_scan_args(arg_count, args, "21", &rb_document, &page_index, &options);
170
+
171
+ Document* document;
172
+ Data_Get_Struct(rb_document, Document, document);
173
+
174
+ Page* page;
175
+ Data_Get_Struct(self, Page, page);
176
+
177
+ // NUM2INT raises for non-integers instead of reinterpreting arbitrary objects.
178
+ int index = NUM2INT(page_index);
179
+ // Stop here if PDFium could not open the page (e.g. index out of range),
180
+ // rather than publishing dimensions read from a page that does not exist.
181
+ if (!page->load(document, index)) { rb_raise(rb_eArgError, "failed to load page %d", index); }
182
+
183
+ rb_ivar_set(self, rb_intern("@width"), INT2FIX(page->width()));
184
+ rb_ivar_set(self, rb_intern("@height"), INT2FIX(page->height()));
185
+ rb_ivar_set(self, rb_intern("@aspect"), rb_float_new(page->aspect()));
186
+
187
+ return self;
188
+ }
189
+
190
+ static void destroy_page(Page* page) { delete page; } // free function passed to Data_Wrap_Struct in page_allocate