pdfshaver 0.0.1.alpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 5a34c73145fff7cd843f8770b4cdb14648b0d8e8
4
+ data.tar.gz: 543a70e710dc09646580d1bb0fff7bce6f41fe5e
5
+ SHA512:
6
+ metadata.gz: 3bca072b82f7710883dec16c6be56fcd696ef074c40b55ca1139f3671985ea7690fad1f2e7cfb0333f4ec480d1b92b2c722e3fa990a1e7a8b19f6b2bbc814ac0
7
+ data.tar.gz: 3ec48a4b055c0b985567f5922194dbd853a489e392f64cf5ee9b641dd3e45ce9641b2028b4b93d6d802c10516b8cfb4e37575e73c29d27d6a831c31f0a551bf1
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
# Resolve gems from the public RubyGems index.
source 'https://rubygems.org'

# Pull the dependency list from the gemspec that sits next to this Gemfile.
gemspec
data/Rakefile ADDED
@@ -0,0 +1,10 @@
1
+ require "bundler/gem_tasks"
2
+ require 'rake/extensiontask'
3
+
4
# Register the compile tasks for the native `pdfium_ruby` extension.
Rake::ExtensionTask.new('pdfium_ruby')

# Build the extension first, then load every *_spec.rb found below test/.
task :test => :compile do
  spec_pattern = File.join(File.dirname(__FILE__), "test", "**", "*_spec.rb")
  Dir.glob(spec_pattern).each do |spec_file|
    require spec_file
  end
end

# A bare `rake` runs the test task.
task :default => :test
data/Readme.md ADDED
@@ -0,0 +1,14 @@
1
+ # PDFShaver
2
+
3
+ Shave pages off of PDFs as images
4
+
5
+ ### N.B. This is a work in progress
6
+
7
+ ### Examples
8
+
9
+ require 'pdfshaver'
10
+ document = PDFShaver::Document.new('./path/to/document.pdf')
11
+ landscape_pages = document.pages.select{ |page| page.aspect > 1 }
12
+ landscape_pages.each{ |page| page.render("./page_#{page.number}.gif") }
13
+
14
+ copyright 2015 Ted Han, Nathan Stitt & DocumentCloud
@@ -0,0 +1,131 @@
1
+ #include "document.h"
2
+
3
+ /********************************************
4
+ * C++ Document definition
5
+ *********************************************/
6
+ Document::Document() {
7
+ // Initialize state variables
8
+ // to mark whether document has been used
9
+ // and whether document is freeable.
10
+ this->opened = false;
11
+ this->ready_to_be_freed = true;
12
+ }
13
+
14
+ Document::~Document() {
15
+ // make sure the document exists and was initialized before
16
+ // trying to close it.
17
+ if (this->opened) { FPDF_CloseDocument(this->fpdf_document); }
18
+ }
19
+
20
+ int Document::load(VALUE path) {
21
+ // load the document via PDFium.
22
+ // returns false if loading document fails.
23
+ this->fpdf_document = FPDF_LoadDocument(StringValuePtr(path), NULL);
24
+ int parse_status = FPDF_GetLastError();
25
+ // indicate that Ruby is still using this document.
26
+ this->opened = this->isValid();
27
+ this->ready_to_be_freed = false;
28
+ return parse_status;
29
+ }
30
+
31
+ int Document::length() { return FPDF_GetPageCount(this->fpdf_document); }
32
+
33
+ bool Document::isValid() { return !!(this->fpdf_document); }
34
+
35
+ void Document::flagDocumentAsReadyForRelease() { this->ready_to_be_freed = true; }
36
+
37
+
38
+ void Document::notifyPageOpened(Page* page) {
39
+ //ruby_puts_cstring("Adding page to open pages");
40
+ this->open_pages.insert(page);
41
+ }
42
+ void Document::notifyPageClosed(Page* page) {
43
+ //ruby_puts_cstring("Removing page from open pages");
44
+ this->open_pages.erase(page);
45
+ this->destroyUnlessPagesAreOpen();
46
+ }
47
+ void Document::destroyUnlessPagesAreOpen() {
48
+ // once the document is no longer being used, and none of its child pages are open
49
+ // it's safe to destroy.
50
+ if (!(this->opened) || (this->opened && this->ready_to_be_freed && this->open_pages.empty())) {
51
+ //ruby_puts_cstring("Deleting Document");
52
+ delete this;
53
+ }
54
+ }
55
+
56
+ /********************************************
57
+ * Ruby class definition and initialization
58
+ *********************************************/
59
+
60
+ void Define_Document() {
61
+ // Get the PDFShaver namespace and get the `Document` class inside it.
62
+ VALUE rb_PDFShaver = rb_const_get(rb_cObject, rb_intern("PDFShaver"));
63
+ VALUE rb_PDFShaver_Document = rb_const_get(rb_PDFShaver, rb_intern("Document"));
64
+
65
+ rb_define_alloc_func(rb_PDFShaver_Document, *document_allocate);
66
+
67
+ rb_define_private_method(rb_PDFShaver_Document, "open_document_with_pdfium",
68
+ CPP_RUBY_METHOD_FUNC(initialize_document_internals), -1);
69
+ };
70
+
71
+ VALUE document_allocate(VALUE rb_PDFShaver_Document) {
72
+ Document* document = new Document();
73
+ return Data_Wrap_Struct(rb_PDFShaver_Document, NULL, destroy_document_when_safe, document);
74
+ }
75
+
76
+ // Entry point for PDFShaver::Document's ruby initializer into C++ land
77
+ VALUE initialize_document_internals(int arg_count, VALUE* args, VALUE self) {
78
+ // Get the PDFShaver namespace and get the `Document` class inside it.
79
+ VALUE rb_PDFShaver = rb_const_get(rb_cObject, rb_intern("PDFShaver"));
80
+ VALUE rb_PDFShaver_Document = rb_const_get(rb_PDFShaver, rb_intern("Document"));
81
+
82
+ // use Ruby's argument scanner to pull out a required
83
+ // `path` argument and an optional `options` hash.
84
+ VALUE path, options;
85
+ int number_of_args = rb_scan_args(arg_count, args, "11", &path, &options);
86
+
87
+ // attempt to open document.
88
+ // path should at this point be validated & known to exist.
89
+ Document* document;
90
+ Data_Get_Struct(self, Document, document);
91
+ int parse_status = document->load(path);
92
+ //document_handle_parse_status(parse_status, path);
93
+ if (!document->isValid()) { rb_raise(rb_eArgError, "failed to open file (%" PRIsVALUE")", path); }
94
+
95
+ // get the document length and store it as an instance variable on the class.
96
+ rb_ivar_set(self, rb_intern("@length"), INT2FIX(document->length()));
97
+ return self;
98
+ }
99
+
100
+ void document_handle_parse_status(int status, VALUE path) {
101
+ //printf("\nSTATUS: %d\n", status);
102
+ VALUE rb_PDFShaver = rb_const_get(rb_cObject, rb_intern("PDFShaver"));
103
+ VALUE rb_eEncryptionError = rb_const_get(rb_PDFShaver, rb_intern("EncryptionError"));
104
+ VALUE rb_eInvalidFormatError = rb_const_get(rb_PDFShaver, rb_intern("InvalidFormatError"));
105
+ VALUE rb_eMissingHandlerError = rb_const_get(rb_PDFShaver, rb_intern("MissingHandlerError"));
106
+
107
+ //switch (status) {
108
+ // case PDFPARSE_ERROR_SUCCESS:
109
+ // break;
110
+ // case PDFPARSE_ERROR_FILE:
111
+ // rb_raise(rb_eArgError, "unable to open file (%" PRIsVALUE")", path);
112
+ // break;
113
+ // case PDFPARSE_ERROR_FORMAT:
114
+ // rb_raise(rb_eInvalidFormatError, "file (%" PRIsVALUE") is not a valid PDF", path);
115
+ // break;
116
+ // case PDFPARSE_ERROR_PASSWORD:
117
+ // rb_raise(rb_eEncryptionError, "file (%" PRIsVALUE") is encrypted", path);
118
+ // break;
119
+ // case PDFPARSE_ERROR_CERT:
120
+ // rb_raise(rb_eEncryptionError, "file (%" PRIsVALUE") is encrypted", path);
121
+ // break;
122
+ // case PDFPARSE_ERROR_HANDLER:
123
+ // rb_raise(rb_eMissingHandlerError, "could not find handler for media objects in file (%" PRIsVALUE")", path);
124
+ // break;
125
+ //}
126
+ }
127
+
128
+ static void destroy_document_when_safe(Document* document) {
129
+ document->flagDocumentAsReadyForRelease();
130
+ document->destroyUnlessPagesAreOpen();
131
+ }
@@ -0,0 +1,53 @@
1
+ #ifndef __DOCUMENT_H__
2
+ #define __DOCUMENT_H__
3
+
4
+ // forward declaration since Page/Document classes are interdependent
5
+ class Page;
6
+ #include "pdfium_ruby.h"
7
+ #include "fpdf_ext.h"
8
+ //#include "core/include/fpdfapi/fpdf_parser.h"
9
+ #include "page.h"
10
+ #include <unordered_set>
11
+
12
+ // C++ Class to wrap lifecycle of
13
+ // PDF Documents opened through PDFium.
14
+ class Document {
15
+ public:
16
+ FPDF_DOCUMENT fpdf_document;
17
+
18
+ // constructor
19
+ Document();
20
+
21
+ int load(VALUE path);
22
+
23
+ // wrapper for PDFium's pageCount
24
+ int length();
25
+
26
+ bool isValid();
27
+
28
+ // flag to set instances as ready to be disposed of
29
+ // pending ensuring all its pages have been first closed.
30
+ void flagDocumentAsReadyForRelease();
31
+
32
+ // a guard for the destructor.
33
+ void destroyUnlessPagesAreOpen();
34
+
35
+ void notifyPageClosed(Page* page);
36
+ void notifyPageOpened(Page* page);
37
+
38
+ // destructor
39
+ ~Document();
40
+
41
+ private:
42
+ //bool subscribeToPage(Page* page);
43
+ bool opened;
44
+ bool ready_to_be_freed;
45
+ std::unordered_set<Page*> open_pages;
46
+ };
47
+
48
+ static void destroy_document_when_safe(Document* document);
49
+
50
+ VALUE initialize_document_internals(int arg_count, VALUE* args, VALUE self);
51
+ VALUE document_allocate(VALUE rb_PDFShaver_Document);
52
+ void document_handle_parse_status(int status, VALUE path);
53
+ #endif // __DOCUMENT_H__
@@ -0,0 +1,68 @@
1
+ require "mkmf"
2
+ require 'rbconfig'
3
# Expand each base directory into the PDFium sub-directories that may hold
# headers or libraries, and return those followed by the base directories
# themselves.
def append_pdfium_directory_to(paths)
  pdfium_dirs = paths.flat_map do |base|
    pdfium_root = File.join(base, 'pdfium')
    [
      pdfium_root,
      File.join(base, 'pdfium', 'fpdfsdk', 'include'),
      File.join(base, 'pdfium', 'third_party', 'base', 'numerics')
    ]
  end
  pdfium_dirs + paths
end
13
+
14
# Candidate locations for the PDFium libraries and headers.
LIB_DIRS    = append_pdfium_directory_to(%w[/usr/local/lib/ /usr/lib/])
HEADER_DIRS = append_pdfium_directory_to(%w[/usr/local/include/ /usr/include/])

# Tell ruby we want to search in the specified paths
dir_config("pdfium", HEADER_DIRS, LIB_DIRS)

# Libraries the extension links against, in the order they are probed.
# Some names appear twice on purpose-or-not; the list is kept exactly as is.
# NOTE(review): the repeats presumably satisfy static link ordering — confirm.
LIB_FILES = %w[
  javascript
  bigint
  freetype
  fpdfdoc
  fpdftext
  formfiller
  icudata
  icuuc
  icui18n
  v8_libbase
  v8_base
  v8_snapshot
  v8_libplatform
  jsapi
  pdfwindow
  fxedit
  fxcrt
  fxcodec
  fpdfdoc
  fdrm
  fxge
  fpdfapi
  freetype
  pdfium
  pthread
  freeimage
]

# Stop at the first library that cannot be found.
LIB_FILES.each do |lib|
  next if have_library(lib)
  abort "Couldn't find library lib#{lib} in #{LIB_DIRS.join(', ')}"
end

$CPPFLAGS += " -fPIC -std=c++11"

# On macOS additionally link the Objective-C runtime and two frameworks.
if RUBY_PLATFORM =~ /darwin/
  have_library('objc')
  FRAMEWORKS = %w{AppKit CoreFoundation}
  $LDFLAGS << FRAMEWORKS.map { |f| " -framework #{f}" }.join
end

create_makefile "pdfium_ruby"
@@ -0,0 +1,190 @@
1
+ #include "page.h"
2
+ #include <FreeImage.h>
3
+
4
+ /********************************************
5
+ * C++ Page definition
6
+ *********************************************/
7
+
8
+ Page::Page() { this->opened = false; }
9
+
10
+ bool Page::load(Document* document, int page_index) {
11
+ this->document = document;
12
+ this->page_index = page_index;
13
+
14
+ this->fpdf_page = FPDF_LoadPage(document->fpdf_document, page_index);
15
+ document->notifyPageOpened(this);
16
+ this->opened = true;
17
+ return this->opened;
18
+ }
19
+
20
+ double Page::width(){ return FPDF_GetPageWidth(this->fpdf_page); }
21
+ double Page::height(){ return FPDF_GetPageHeight(this->fpdf_page); }
22
+ double Page::aspect() { return width() / height(); }
23
+
24
+ bool Page::render(char* path, int width, int height) {
25
+ // If no height or width is supplied, render at natural dimensions.
26
+ if (!width && !height) {
27
+ width = (int)(this->width());
28
+ height = (int)(this->height());
29
+ }
30
+ // When given only a height or a width,
31
+ // infer the other by preserving page aspect ratio.
32
+ if ( width && !height) { height = width / this->aspect(); }
33
+ if (!width && height) { width = height * this->aspect(); }
34
+ //printf("Derp? %d, %d\n", width, height);
35
+
36
+ // Create bitmap. width, height, alpha 1=enabled,0=disabled
37
+ bool alpha = false;
38
+ FPDF_BITMAP bitmap = FPDFBitmap_Create(width, height, alpha);
39
+ if (!bitmap) { return false; }
40
+
41
+ // fill all pixels with white for the background color
42
+ FPDFBitmap_FillRect(bitmap, 0, 0, width, height, 0xFFFFFFFF);
43
+
44
+ // Render a page to a bitmap in RGBA format
45
+ // args are: *buffer, page, start_x, start_y, size_x, size_y, rotation, and flags
46
+ // flags are:
47
+ // 0 for normal display, or combination of flags defined below
48
+ // 0x01 Set if annotations are to be rendered
49
+ // 0x02 Set if using text rendering optimized for LCD display
50
+ // 0x04 Set if you don't want to use GDI+
51
+ int start_x = 0;
52
+ int start_y = 0;
53
+ int rotation = 0;
54
+ int flags = FPDF_PRINTING; // A flag defined in PDFium's codebase.
55
+ FPDF_RenderPageBitmap(bitmap, this->fpdf_page, start_x, start_y, width, height, rotation, flags);
56
+
57
+ // The stride holds the width of one row in bytes. It may not be an exact
58
+ // multiple of the pixel width because the data may be packed to always end on a byte boundary
59
+ int stride = FPDFBitmap_GetStride(bitmap);
60
+
61
+ // Safety checks to make sure that the bitmap
62
+ // is properly sized and can be safely manipulated
63
+ bool bitmapIsntValid = (
64
+ (stride < 0) ||
65
+ (width > INT_MAX / height) ||
66
+ ((stride * height) > (INT_MAX / 3))
67
+ );
68
+ if (bitmapIsntValid){
69
+ FPDFBitmap_Destroy(bitmap);
70
+ return false;
71
+ }
72
+
73
+ // Read the FPDF bitmap into a FreeImage bitmap.
74
+ unsigned bpp = 32;
75
+ unsigned red_mask = 0xFF0000;
76
+ unsigned green_mask = 0x00FF00;
77
+ unsigned blue_mask = 0x0000FF;
78
+ bool topdown = true;
79
+ FIBITMAP *raw = FreeImage_ConvertFromRawBits(
80
+ (BYTE*)FPDFBitmap_GetBuffer(bitmap), width, height, stride, bpp, red_mask, green_mask, blue_mask, topdown);
81
+
82
+ // at this point we're done with the FPDF bitmap and can destroy it.
83
+ FPDFBitmap_Destroy(bitmap);
84
+
85
+ // Conversion to jpg or gif require that the bpp be set to 24
86
+ // since we're not exporting using alpha transparency above in FPDFBitmap_Create
87
+ FIBITMAP *image = FreeImage_ConvertTo24Bits(raw);
88
+ FreeImage_Unload(raw);
89
+
90
+ // figure out the desired format from the file extension
91
+ FREE_IMAGE_FORMAT format = FreeImage_GetFIFFromFilename(path);
92
+
93
+ bool success = false;
94
+ if ( FIF_GIF == format ){
95
+ // Gif requires quantization to drop to 8bpp
96
+ FIBITMAP *gif = FreeImage_ColorQuantize(image, FIQ_WUQUANT);
97
+ success = FreeImage_Save(FIF_GIF, gif, path, GIF_DEFAULT);
98
+ FreeImage_Unload(gif);
99
+ } else {
100
+ // All other formats should be just a save call
101
+ success = FreeImage_Save(format, image, path, 0);
102
+ }
103
+
104
+ // unload the image
105
+ FreeImage_Unload(image);
106
+
107
+ return success;
108
+ }
109
+
110
+ Page::~Page() {
111
+ if (this->opened) {
112
+ FPDF_ClosePage(this->fpdf_page);
113
+ this->document->notifyPageClosed(this);
114
+ }
115
+ }
116
+
117
+ /********************************************
118
+ * Ruby class definition and initialization
119
+ *********************************************/
120
+
121
+ void Define_Page() {
122
+ // Get the PDFShaver namespace and get the `Page` class inside it.
123
+ VALUE rb_PDFShaver = rb_const_get(rb_cObject, rb_intern("PDFShaver"));
124
+ VALUE rb_PDFShaver_Page = rb_const_get(rb_PDFShaver, rb_intern("Page"));
125
+
126
+ rb_define_alloc_func(rb_PDFShaver_Page, *page_allocate);
127
+
128
+ rb_define_method(rb_PDFShaver_Page, "render", CPP_RUBY_METHOD_FUNC(page_render), -1);
129
+ rb_define_private_method(rb_PDFShaver_Page, "initialize_page_internals",
130
+ CPP_RUBY_METHOD_FUNC(initialize_page_internals),-1);
131
+ }
132
+
133
+ VALUE page_allocate(VALUE rb_PDFShaver_Page) {
134
+ Page* page = new Page();
135
+ return Data_Wrap_Struct(rb_PDFShaver_Page, NULL, destroy_page, page);
136
+ }
137
+
138
+ //bool page_render(int arg_count, VALUE* args, VALUE self) {
139
+ VALUE page_render(int arg_count, VALUE* args, VALUE self) {
140
+ VALUE path, options;
141
+ int width = 0, height = 0;
142
+
143
+ int number_of_args = rb_scan_args(arg_count, args, "1:", &path, &options);
144
+ if (arg_count > 1) {
145
+ VALUE rb_width = rb_hash_aref(options, ID2SYM(rb_intern("width")));
146
+ VALUE rb_height = rb_hash_aref(options, ID2SYM(rb_intern("height")));
147
+
148
+ if (!(NIL_P(rb_width))) {
149
+ if (FIXNUM_P(rb_width)) { width = FIX2INT(rb_width); }
150
+ else { rb_raise(rb_eArgError, ":width must be a integer"); }
151
+ }
152
+ if (!(NIL_P(rb_height))) {
153
+ if (FIXNUM_P(rb_height)) { height = FIX2INT(rb_height); }
154
+ else { rb_raise(rb_eArgError, ":height must be a integer"); }
155
+ }
156
+ }
157
+
158
+ FREE_IMAGE_FORMAT format = FreeImage_GetFIFFromFilename(StringValuePtr(path));
159
+ if (format == FIF_UNKNOWN) { rb_raise(rb_eArgError, "can't save to unrecognized image format"); }
160
+
161
+ Page* page;
162
+ Data_Get_Struct(self, Page, page);
163
+ return (page->render(StringValuePtr(path), width, height) ? Qtrue : Qfalse);
164
+ }
165
+
166
+ VALUE initialize_page_internals(int arg_count, VALUE* args, VALUE self) {
167
+ // use Ruby's argument scanner to pull out a required
168
+ VALUE rb_document, page_index, options;
169
+ int number_of_args = rb_scan_args(arg_count, args, "21", &rb_document, &page_index, &options);
170
+
171
+ // Get the PDFShaver namespace and get the `Page` class inside it.
172
+ VALUE rb_PDFShaver = rb_const_get(rb_cObject, rb_intern("PDFShaver"));
173
+ VALUE rb_PDFShaver_Page = rb_const_get(rb_PDFShaver, rb_intern("Page"));
174
+
175
+ Document* document;
176
+ Data_Get_Struct(rb_document, Document, document);
177
+
178
+ Page* page;
179
+ Data_Get_Struct(self, Page, page);
180
+
181
+ page->load(document, FIX2INT(page_index));
182
+
183
+ rb_ivar_set(self, rb_intern("@width"), INT2FIX(page->width()));
184
+ rb_ivar_set(self, rb_intern("@height"), INT2FIX(page->height()));
185
+ rb_ivar_set(self, rb_intern("@aspect"), rb_float_new(page->aspect()));
186
+
187
+ return self;
188
+ }
189
+
190
+ static void destroy_page(Page* page) { delete page; }