pdfium 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/.ruby-version +1 -0
  4. data/Gemfile +9 -0
  5. data/Guardfile +7 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +68 -0
  8. data/Rakefile +62 -0
  9. data/ext/pdfium_ext/bookmark.cc +221 -0
  10. data/ext/pdfium_ext/buffer_file_write.hpp +27 -0
  11. data/ext/pdfium_ext/document.cc +268 -0
  12. data/ext/pdfium_ext/document.h +66 -0
  13. data/ext/pdfium_ext/document_wrapper.cc +63 -0
  14. data/ext/pdfium_ext/document_wrapper.h +56 -0
  15. data/ext/pdfium_ext/extconf.h +3 -0
  16. data/ext/pdfium_ext/extconf.rb +76 -0
  17. data/ext/pdfium_ext/image.cc +332 -0
  18. data/ext/pdfium_ext/page.cc +392 -0
  19. data/ext/pdfium_ext/page.h +5 -0
  20. data/ext/pdfium_ext/page_object_wrapper.cc +38 -0
  21. data/ext/pdfium_ext/page_object_wrapper.h +27 -0
  22. data/ext/pdfium_ext/page_wrapper.cc +86 -0
  23. data/ext/pdfium_ext/page_wrapper.h +37 -0
  24. data/ext/pdfium_ext/pdfium.cc +115 -0
  25. data/ext/pdfium_ext/pdfium.h +69 -0
  26. data/lib/pdfium.rb +15 -0
  27. data/lib/pdfium/bookmark_list.rb +28 -0
  28. data/lib/pdfium/bounding_box.rb +16 -0
  29. data/lib/pdfium/image_list.rb +21 -0
  30. data/lib/pdfium/page_list.rb +36 -0
  31. data/lib/pdfium/page_sizes.rb +7 -0
  32. data/lib/pdfium/version.rb +4 -0
  33. data/pdfium.gemspec +29 -0
  34. data/test/benchmark-docsplit.rb +41 -0
  35. data/test/bookmarks_list_spec.rb +26 -0
  36. data/test/bookmarks_spec.rb +34 -0
  37. data/test/debug.rb +24 -0
  38. data/test/document_spec.rb +49 -0
  39. data/test/image_list_spec.rb +18 -0
  40. data/test/image_spec.rb +53 -0
  41. data/test/page_list_spec.rb +24 -0
  42. data/test/page_spec.rb +91 -0
  43. data/test/pdfium_spec.rb +15 -0
  44. data/test/profile.rb +29 -0
  45. data/test/spec_helper.rb +31 -0
  46. metadata +202 -0
@@ -0,0 +1,392 @@
1
+ #include "page.h"
2
+ #include "page_wrapper.h"
3
+ #include "pdfium.h"
4
+ #include <FreeImage.h>
5
+
6
+ #include "fpdfview.h"
7
+ #include <limits.h>
8
+ #include <algorithm>
9
+ #include <string>
10
+ extern "C" {
11
+ #include "ruby/encoding.h"
12
+ }
13
+
14
+ static VALUE rb_sym_height;
15
+ static VALUE rb_sym_width;
16
+ //static VALUE RB_Document;
17
+
18
+
19
+ /////////////////////////////////////////////////////////////////////////
20
+ // The Page class //
21
+ /////////////////////////////////////////////////////////////////////////
22
+
23
+ /*
24
+ * Document-class: PDFium::Page
25
+ *
26
+ * A Page on a PDF Document
27
+ */
28
+ static void
29
+ page_gc_free(PageWrapper* page)
30
+ {
31
+ DEBUG_MSG("GC Free Page: " << page);
32
+ // The page's destructor will remove itself from the Document, and perform all cleanup
33
+ page->markUnused();
34
+ }
35
+
36
+
37
+ /*
38
+ * call-seq:
39
+ * Page.new -> raises RuntimeError
40
+ *
41
+ * Pages cannot be created by using Page.new, instead Page.open or Page.create
42
+ * should be used
43
+ */
44
+ static VALUE
45
+ page_new(VALUE klass)
46
+ {
47
+ rb_raise(rb_eRuntimeError, "Use Page.open or Page.create");
48
+ }
49
+
50
+
51
+ /*
52
+ * call-seq:
53
+ * Page.open(PDFIum::Document, page_index) -> Page
54
+ *
55
+ * Opens a given page on a document
56
+ */
57
+ static VALUE
58
+ page_open(VALUE klass, VALUE rb_document, VALUE rb_page_number)
59
+ {
60
+ DocumentWrapper *doc_wrapper;
61
+ Data_Get_Struct(rb_document, DocumentWrapper, doc_wrapper);
62
+
63
+ int pg = FIX2INT(rb_page_number);
64
+ if ( pg < 0 || pg >= doc_wrapper->document->GetPageCount() ){
65
+ rb_raise(rb_eRangeError, "%d is out of range: 0...%d",
66
+ pg, doc_wrapper->document->GetPageCount() );
67
+ }
68
+
69
+ PageWrapper *page_wrapper = new PageWrapper(doc_wrapper, FIX2INT(rb_page_number));
70
+ return Data_Wrap_Struct(klass, NULL, page_gc_free, page_wrapper);
71
+ }
72
+
73
+
74
+
75
+
76
+ /*
77
+ * call-seq:
78
+ * Page.create(PDFIum::Document, page_number=document.page_count) -> Page
79
+ *
80
+ * Creates a new page on a document. The page_number defaults to the
81
+ * Document#page_count, causing pages to be appended to the back of the document
82
+ * by default if no page_number is given.
83
+ */
84
+ static VALUE
85
+ page_create(int argc, VALUE *argv, VALUE klass)
86
+ {
87
+ VALUE rb_document, rb_page_number, options;
88
+ rb_scan_args(argc, argv, "11:", &rb_document, &rb_page_number, &options);
89
+ if (NIL_P(options)){
90
+ options=rb_hash_new();
91
+ rb_hash_aset(options, ID2SYM(rb_intern("size")),
92
+ rb_const_get(RB::PDFium(), rb_intern("LETTER")) );
93
+ }
94
+
95
+ VALUE size, rb_width, rb_height;
96
+ if ( !NIL_P(size = RB::get_option(options,"size")) ){
97
+ rb_width = rb_ary_entry(size, 0);
98
+ rb_height = rb_ary_entry(size, 1);
99
+ } else {
100
+ rb_width = RB::get_option(options,"width");
101
+ rb_height = RB::get_option(options,"height");
102
+ }
103
+
104
+
105
+ if ( NIL_P(rb_width) || NIL_P(rb_height) ){
106
+ rb_raise(rb_eArgError, ":height or :width must be given");
107
+ }
108
+
109
+ DocumentWrapper *doc_wrapper;
110
+ Data_Get_Struct(rb_document, DocumentWrapper, doc_wrapper);
111
+
112
+ int page_number;
113
+ if (NIL_P(rb_page_number)){
114
+ page_number = doc_wrapper->document->GetPageCount();
115
+ } else {
116
+ page_number = FIX2INT(rb_page_number);
117
+ }
118
+
119
+ if ( page_number < 0 || page_number > doc_wrapper->document->GetPageCount() ){
120
+ rb_raise(rb_eRangeError, "%d is out of range: 0...%d",
121
+ page_number, doc_wrapper->document->GetPageCount() );
122
+ }
123
+
124
+
125
+ CPDF_Page* newpage = (CPDF_Page*)FPDFPage_New(doc_wrapper->document, page_number,
126
+ FIX2INT(rb_width), FIX2INT(rb_height) );
127
+
128
+ PageWrapper *page_wrapper = new PageWrapper(doc_wrapper, rb_page_number);
129
+ page_wrapper->setPage(newpage);
130
+ VALUE i=Data_Wrap_Struct(klass, NULL, page_gc_free, page_wrapper);
131
+ return i;
132
+ }
133
+
134
+
135
+ /*
136
+ * call-seq:
137
+ * width -> Float
138
+ *
139
+ * Returns the width of the page.
140
+ * The width is given in terms of points, which are set to 72 per inch. (DPI)
141
+ */
142
+ static VALUE
143
+ page_width(VALUE self)
144
+ {
145
+ return rb_float_new( FPDF_GetPageWidth(RB2PG(self)) );
146
+ }
147
+
148
+ /*
149
+ * call-seq:
150
+ * height -> Float
151
+ *
152
+ * Returns the height of the page.
153
+ * The height is given in terms of points, which are set to 72 per inch. (DPI)
154
+ */
155
+ static VALUE
156
+ page_height(VALUE self)
157
+ {
158
+ return rb_float_new( FPDF_GetPageHeight(RB2PG(self)) );
159
+ }
160
+
161
+
162
+ /*
163
+ * call-seq:
164
+ * text -> String
165
+ *
166
+ * Returns the text that is contained on the page as a UTF-16LE encoded string
167
+ */
168
+ static VALUE
169
+ page_text(VALUE self)
170
+ {
171
+ static rb_encoding *enc = rb_enc_find("UTF-16LE");
172
+
173
+
174
+ PageWrapper *pw;
175
+ Data_Get_Struct(self, PageWrapper, pw);
176
+
177
+ CPDF_Page *page = pw->page();
178
+ IPDF_TextPage *text_page = (IPDF_TextPage*)FPDFText_LoadPage(page);
179
+ //
180
+ unsigned int buff_size = text_page->CountChars()*2 + 1; // 16 bit per char, plus terminator
181
+ char *buffer = ALLOC_N(char, buff_size );
182
+
183
+
184
+
185
+ FPDFText_GetText((FPDF_TEXTPAGE)text_page, 0, text_page->CountChars(), (unsigned short*)buffer);
186
+
187
+
188
+ VALUE ret = rb_enc_str_new(buffer, buff_size-1, enc);
189
+
190
+ xfree(buffer);
191
+
192
+ return ret;
193
+ }
194
+
195
+
196
+
197
+
198
+ /*
199
+ * call-seq:
200
+ * number -> Fixnum
201
+ *
202
+ * Returns the page number that the page represents on the document.
203
+ * It is *NOT* zero based, meaning that the first page#number will be 1.
204
+ *
205
+ * *Warning:* if pages are added/removed after the page is loaded, this value will be inaccurate.
206
+ */
207
+ static VALUE
208
+ page_number(VALUE self)
209
+ {
210
+ PageWrapper *pw;
211
+ Data_Get_Struct(self, PageWrapper, pw);
212
+ return INT2FIX(pw->_page_number+1);
213
+ }
214
+
215
+
216
+
217
+ /*
218
+ call-seq:
219
+ images -> ImageList
220
+
221
+ Returns ImageList which contains all the images on the page. Images are lazily loaded only when requested.
222
+
223
+ === Example
224
+ pdf = PDFium::Document.new( "test.pdf" )
225
+ page = pdf.pages.first
226
+ page.images.each do | image |
227
+ image.save("pg-#{page.number}-#{image.index}.png")
228
+ end
229
+
230
+ */
231
+ static VALUE
232
+ page_images(VALUE self)
233
+ {
234
+ VALUE args[1];
235
+ args[0] = self;
236
+ return rb_class_new_instance( 1, args, RB::ImageList() );
237
+ }
238
+
239
+ /*
240
+ call-seq:
241
+ as_image(width:nil, height:nil) -> Image
242
+
243
+ Render a page as an image of width and height to the given file. The image type
244
+ will be auto-detected from the file_path's extension, and can be any of the
245
+ formats supported by the FreeImage library http://freeimage.sourceforge.net/features.html
246
+
247
+ If neither the height or width are given, it will be calculated to retain the
248
+ approprate page scale.
249
+
250
+ Returns an Image instance.
251
+
252
+ === Example
253
+ pdf = PDFium::Document.new( "test.pdf" )
254
+ page = pdf.pages[0]
255
+ page.as_image(height: 100, width: 75).save("pg-#{page.number}-sm.png")
256
+ page.as_image(height: 500).save("pg-#{page.number}-md.png")
257
+ page.as_image(width: 1000).save("pg-#{page.number}-lg.png")
258
+
259
+ If the above page's #dimensions were 1000x1500, then the following images would be generated:
260
+ pg-1-sm.png -> 100x75
261
+ pg-1-md.png -> 500x750
262
+ pg-1-lg.png -> 750x1000
263
+ */
264
+ static VALUE
265
+ page_as_image(int argc, VALUE *argv, VALUE self)
266
+ {
267
+ CPDF_Page *page = RB2PG(self);
268
+
269
+ VALUE rb_options;
270
+ rb_scan_args(argc,argv,":", &rb_options);
271
+ if (NIL_P(rb_options)){
272
+ rb_options=rb_hash_new();
273
+ }
274
+ if ( TYPE(rb_options) != T_HASH ){
275
+ rb_raise(rb_eTypeError, "wrong argument type %s (expected Hash)", rb_obj_classname(rb_options));
276
+ }
277
+
278
+ VALUE width_option = rb_hash_aref(rb_options, rb_sym_width);
279
+ VALUE height_option = rb_hash_aref(rb_options, rb_sym_height);
280
+
281
+ int width = NIL_P(width_option) ? 0 : FIX2INT(width_option);
282
+ int height = NIL_P(height_option) ? 0 : FIX2INT(height_option);
283
+ if (!width && !height){
284
+ width = FPDF_GetPageWidth(page) * 2;
285
+ }
286
+
287
+ if (!width)
288
+ width = FPDF_GetPageWidth(page) * ( (double)height / FPDF_GetPageHeight(page) );
289
+ if (!height)
290
+ height = FPDF_GetPageHeight(page) * ( (double)width / FPDF_GetPageWidth(page) );
291
+
292
+ VALUE args[2];
293
+ args[0] = self;
294
+ VALUE img_options = args[1] = rb_hash_new();
295
+ rb_hash_aset(img_options, rb_sym_width, INT2FIX(width));
296
+ rb_hash_aset(img_options, rb_sym_height, INT2FIX(height));
297
+
298
+ VALUE bounds_args[4];
299
+ bounds_args[0] = rb_float_new( 0 );
300
+ bounds_args[1] = rb_float_new( FPDF_GetPageWidth(page) );
301
+ bounds_args[2] = rb_float_new( 0 );
302
+ bounds_args[3] = rb_float_new( FPDF_GetPageHeight(page) );
303
+ VALUE bounds = rb_class_new_instance( 4, bounds_args, RB::BoundingBox() );
304
+ rb_hash_aset(img_options, ID2SYM(rb_intern("bounds")), bounds);
305
+
306
+ return rb_class_new_instance( 2, args, RB::Image() );
307
+ }
308
+
309
+ /*
310
+ * call-seq:
311
+ * unload -> Page
312
+ *
313
+ * Frees a large portion of the internal memory allocated to the page.
314
+ * When a page is parsed by the PDFIum engine, various elements are cached in memory
315
+ * While Ruby will eventually garbage collect the Page instance once it's no longer
316
+ * in use, this method will free the memory immediatly. Page#unload is safe to use
317
+ * since the Page will re-load itself as needed, but calling it while the page
318
+ * is still in use will cause additional work by the engine since it will have to
319
+ * repeatedly re-parse the page when it re-loads itself.
320
+ *
321
+ * PageList#each will call this method on each page after it yields.
322
+ */
323
+ static VALUE
324
+ page_unload(VALUE self)
325
+ {
326
+ PageWrapper *pw;
327
+ Data_Get_Struct(self, PageWrapper, pw);
328
+ pw->unload();
329
+ return self;
330
+ }
331
+
332
+ // creates and yeilds an image. Not documented since all access
333
+ // should got through the ImageList interface via the Page#images method
334
+ /* :nodoc: */
335
+ static VALUE
336
+ page_each_image(VALUE self)
337
+ {
338
+ PageWrapper *pw;
339
+ Data_Get_Struct(self, PageWrapper, pw);
340
+
341
+ auto count = pw->page()->CountObjects();
342
+ int image_index=0;
343
+ for (int index=0; index < count; index++){
344
+ CPDF_PageObject *object = pw->page()->GetObjectByIndex(index);
345
+ if ( PDFPAGE_IMAGE == object->m_Type ){
346
+ VALUE args[2];
347
+ args[0] = self;
348
+ VALUE img_options = args[1] = rb_hash_new();
349
+
350
+ rb_hash_aset(img_options, ID2SYM(rb_intern("object_index")), INT2FIX(index));
351
+
352
+ rb_hash_aset(img_options, ID2SYM(rb_intern("index")), INT2FIX(image_index));
353
+
354
+ VALUE img = rb_class_new_instance( 2, args, RB::Image() );
355
+ rb_yield( img );
356
+ image_index++;
357
+ }
358
+ }
359
+ return self;
360
+ }
361
+
362
+
363
+ VALUE
364
+ define_page_class()
365
+ {
366
+ rb_sym_width = ID2SYM(rb_intern("width"));
367
+ rb_sym_height = ID2SYM(rb_intern("height"));
368
+
369
+ VALUE RB_PDFium = RB::PDFium();
370
+
371
+ // The Page class definition and methods
372
+ VALUE RB_Page = rb_define_class_under(RB_PDFium, "Page", rb_cObject);
373
+ //rb_define_alloc_func (RB_Page, page_allocate);
374
+ //rb_define_private_method (RB_Page, "initialize", RUBY_METHOD_FUNC(page_initialize), -1);
375
+
376
+ rb_define_singleton_method(RB_Page, "new", RUBY_METHOD_FUNC(page_new), 0);
377
+ rb_define_singleton_method(RB_Page, "open", RUBY_METHOD_FUNC(page_open), 2);
378
+ rb_define_singleton_method(RB_Page, "create", RUBY_METHOD_FUNC(page_create), -1);
379
+
380
+ rb_define_method (RB_Page, "text", RUBY_METHOD_FUNC(page_text), 0);
381
+ rb_define_method (RB_Page, "width", RUBY_METHOD_FUNC(page_width), 0);
382
+ rb_define_method (RB_Page, "height", RUBY_METHOD_FUNC(page_height), 0);
383
+ rb_define_method (RB_Page, "as_image", RUBY_METHOD_FUNC(page_as_image), -1);
384
+ rb_define_method (RB_Page, "unload", RUBY_METHOD_FUNC(page_unload), 0);
385
+ rb_define_method (RB_Page, "number", RUBY_METHOD_FUNC(page_number), 0);
386
+
387
+ rb_define_method (RB_Page, "images", RUBY_METHOD_FUNC(page_images), 0);
388
+
389
+ rb_define_method (RB_Page, "each_image", RUBY_METHOD_FUNC(page_each_image), 0);
390
+
391
+ return RB_Page;
392
+ }
@@ -0,0 +1,5 @@
1
+ #ifndef __PAGE_H__
2
+ #define __PAGE_H__
3
+
4
+
5
+ #endif // __PAGE_H__
@@ -0,0 +1,38 @@
1
+ #include "pdfium.h"
2
+ #include "page_wrapper.h"
3
+
4
+ PageObjectWrapper::PageObjectWrapper():
5
+ page_wrapper(0),
6
+ object(0),
7
+ page_object_index(-1)
8
+ {}
9
+
10
+ PageObjectWrapper::~PageObjectWrapper(){
11
+ if (page_wrapper)
12
+ page_wrapper->release(this);
13
+ if (object){
14
+ // object->Release();
15
+ }
16
+ }
17
+
18
+ void
19
+ PageObjectWrapper::wrap(CPDF_PageObject *obj, PageWrapper *pg){
20
+ this->object = obj;
21
+ this->page_wrapper = pg;
22
+ this->page_wrapper->retain(this);
23
+ }
24
+
25
+
26
+
27
+ CPDF_ImageObject*
28
+ RB2IMG(VALUE self) {
29
+ PageObjectWrapper* po;
30
+ Data_Get_Struct(self, PageObjectWrapper, po);
31
+ return static_cast<CPDF_ImageObject*>(po->object);
32
+ }
33
+
34
+ void
35
+ ImageWrapper::wrap(PageWrapper *page_wrapper){
36
+ this->page_wrapper = page_wrapper;
37
+ this->page_wrapper->retain(this);
38
+ }
@@ -0,0 +1,27 @@
1
+ #ifndef __PAGE_OBJECT_WRAPPER_H__
2
+ #define __PAGE_OBJECT_WRAPPER_H__
3
+
4
+ class PageWrapper;
5
+
6
+ class PageObjectWrapper {
7
+ public:
8
+ PageObjectWrapper();
9
+ ~PageObjectWrapper();
10
+
11
+ void wrap(CPDF_PageObject *object, PageWrapper *page_wrapper);
12
+
13
+ PageWrapper *page_wrapper;
14
+ CPDF_PageObject *object;
15
+ int page_object_index;
16
+ };
17
+
18
+
19
+ class ImageWrapper : public PageObjectWrapper {
20
+ public:
21
+
22
+ void wrap(PageWrapper *page_wrapper);
23
+
24
+ };
25
+
26
+
27
+ #endif // __PAGE_OBJECT_WRAPPER_H__