pdfium 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/.ruby-version +1 -0
  4. data/Gemfile +9 -0
  5. data/Guardfile +7 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +68 -0
  8. data/Rakefile +62 -0
  9. data/ext/pdfium_ext/bookmark.cc +221 -0
  10. data/ext/pdfium_ext/buffer_file_write.hpp +27 -0
  11. data/ext/pdfium_ext/document.cc +268 -0
  12. data/ext/pdfium_ext/document.h +66 -0
  13. data/ext/pdfium_ext/document_wrapper.cc +63 -0
  14. data/ext/pdfium_ext/document_wrapper.h +56 -0
  15. data/ext/pdfium_ext/extconf.h +3 -0
  16. data/ext/pdfium_ext/extconf.rb +76 -0
  17. data/ext/pdfium_ext/image.cc +332 -0
  18. data/ext/pdfium_ext/page.cc +392 -0
  19. data/ext/pdfium_ext/page.h +5 -0
  20. data/ext/pdfium_ext/page_object_wrapper.cc +38 -0
  21. data/ext/pdfium_ext/page_object_wrapper.h +27 -0
  22. data/ext/pdfium_ext/page_wrapper.cc +86 -0
  23. data/ext/pdfium_ext/page_wrapper.h +37 -0
  24. data/ext/pdfium_ext/pdfium.cc +115 -0
  25. data/ext/pdfium_ext/pdfium.h +69 -0
  26. data/lib/pdfium.rb +15 -0
  27. data/lib/pdfium/bookmark_list.rb +28 -0
  28. data/lib/pdfium/bounding_box.rb +16 -0
  29. data/lib/pdfium/image_list.rb +21 -0
  30. data/lib/pdfium/page_list.rb +36 -0
  31. data/lib/pdfium/page_sizes.rb +7 -0
  32. data/lib/pdfium/version.rb +4 -0
  33. data/pdfium.gemspec +29 -0
  34. data/test/benchmark-docsplit.rb +41 -0
  35. data/test/bookmarks_list_spec.rb +26 -0
  36. data/test/bookmarks_spec.rb +34 -0
  37. data/test/debug.rb +24 -0
  38. data/test/document_spec.rb +49 -0
  39. data/test/image_list_spec.rb +18 -0
  40. data/test/image_spec.rb +53 -0
  41. data/test/page_list_spec.rb +24 -0
  42. data/test/page_spec.rb +91 -0
  43. data/test/pdfium_spec.rb +15 -0
  44. data/test/profile.rb +29 -0
  45. data/test/spec_helper.rb +31 -0
  46. metadata +202 -0
@@ -0,0 +1,392 @@
1
+ #include "page.h"
2
+ #include "page_wrapper.h"
3
+ #include "pdfium.h"
4
+ #include <FreeImage.h>
5
+
6
+ #include "fpdfview.h"
7
+ #include <limits.h>
8
+ #include <algorithm>
9
+ #include <string>
10
+ extern "C" {
11
+ #include "ruby/encoding.h"
12
+ }
13
+
14
+ static VALUE rb_sym_height;
15
+ static VALUE rb_sym_width;
16
+ //static VALUE RB_Document;
17
+
18
+
19
+ /////////////////////////////////////////////////////////////////////////
20
+ // The Page class //
21
+ /////////////////////////////////////////////////////////////////////////
22
+
23
+ /*
24
+ * Document-class: PDFium::Page
25
+ *
26
+ * A Page on a PDF Document
27
+ */
28
+ static void
29
+ page_gc_free(PageWrapper* page)
30
+ {
31
+ DEBUG_MSG("GC Free Page: " << page);
32
+ // The page's destructor will remove itself from the Document, and perform all cleanup
33
+ page->markUnused();
34
+ }
35
+
36
+
37
+ /*
38
+ * call-seq:
39
+ * Page.new -> raises RuntimeError
40
+ *
41
+ * Pages cannot be created by using Page.new, instead Page.open or Page.create
42
+ * should be used
43
+ */
44
+ static VALUE
45
+ page_new(VALUE klass)
46
+ {
47
+ rb_raise(rb_eRuntimeError, "Use Page.open or Page.create");
48
+ }
49
+
50
+
51
+ /*
52
+ * call-seq:
53
+ * Page.open(PDFIum::Document, page_index) -> Page
54
+ *
55
+ * Opens a given page on a document
56
+ */
57
+ static VALUE
58
+ page_open(VALUE klass, VALUE rb_document, VALUE rb_page_number)
59
+ {
60
+ DocumentWrapper *doc_wrapper;
61
+ Data_Get_Struct(rb_document, DocumentWrapper, doc_wrapper);
62
+
63
+ int pg = FIX2INT(rb_page_number);
64
+ if ( pg < 0 || pg >= doc_wrapper->document->GetPageCount() ){
65
+ rb_raise(rb_eRangeError, "%d is out of range: 0...%d",
66
+ pg, doc_wrapper->document->GetPageCount() );
67
+ }
68
+
69
+ PageWrapper *page_wrapper = new PageWrapper(doc_wrapper, FIX2INT(rb_page_number));
70
+ return Data_Wrap_Struct(klass, NULL, page_gc_free, page_wrapper);
71
+ }
72
+
73
+
74
+
75
+
76
+ /*
77
+ * call-seq:
78
+ * Page.create(PDFIum::Document, page_number=document.page_count) -> Page
79
+ *
80
+ * Creates a new page on a document. The page_number defaults to the
81
+ * Document#page_count, causing pages to be appended to the back of the document
82
+ * by default if no page_number is given.
83
+ */
84
+ static VALUE
85
+ page_create(int argc, VALUE *argv, VALUE klass)
86
+ {
87
+ VALUE rb_document, rb_page_number, options;
88
+ rb_scan_args(argc, argv, "11:", &rb_document, &rb_page_number, &options);
89
+ if (NIL_P(options)){
90
+ options=rb_hash_new();
91
+ rb_hash_aset(options, ID2SYM(rb_intern("size")),
92
+ rb_const_get(RB::PDFium(), rb_intern("LETTER")) );
93
+ }
94
+
95
+ VALUE size, rb_width, rb_height;
96
+ if ( !NIL_P(size = RB::get_option(options,"size")) ){
97
+ rb_width = rb_ary_entry(size, 0);
98
+ rb_height = rb_ary_entry(size, 1);
99
+ } else {
100
+ rb_width = RB::get_option(options,"width");
101
+ rb_height = RB::get_option(options,"height");
102
+ }
103
+
104
+
105
+ if ( NIL_P(rb_width) || NIL_P(rb_height) ){
106
+ rb_raise(rb_eArgError, ":height or :width must be given");
107
+ }
108
+
109
+ DocumentWrapper *doc_wrapper;
110
+ Data_Get_Struct(rb_document, DocumentWrapper, doc_wrapper);
111
+
112
+ int page_number;
113
+ if (NIL_P(rb_page_number)){
114
+ page_number = doc_wrapper->document->GetPageCount();
115
+ } else {
116
+ page_number = FIX2INT(rb_page_number);
117
+ }
118
+
119
+ if ( page_number < 0 || page_number > doc_wrapper->document->GetPageCount() ){
120
+ rb_raise(rb_eRangeError, "%d is out of range: 0...%d",
121
+ page_number, doc_wrapper->document->GetPageCount() );
122
+ }
123
+
124
+
125
+ CPDF_Page* newpage = (CPDF_Page*)FPDFPage_New(doc_wrapper->document, page_number,
126
+ FIX2INT(rb_width), FIX2INT(rb_height) );
127
+
128
+ PageWrapper *page_wrapper = new PageWrapper(doc_wrapper, rb_page_number);
129
+ page_wrapper->setPage(newpage);
130
+ VALUE i=Data_Wrap_Struct(klass, NULL, page_gc_free, page_wrapper);
131
+ return i;
132
+ }
133
+
134
+
135
+ /*
136
+ * call-seq:
137
+ * width -> Float
138
+ *
139
+ * Returns the width of the page.
140
+ * The width is given in terms of points, which are set to 72 per inch. (DPI)
141
+ */
142
+ static VALUE
143
+ page_width(VALUE self)
144
+ {
145
+ return rb_float_new( FPDF_GetPageWidth(RB2PG(self)) );
146
+ }
147
+
148
+ /*
149
+ * call-seq:
150
+ * height -> Float
151
+ *
152
+ * Returns the height of the page.
153
+ * The height is given in terms of points, which are set to 72 per inch. (DPI)
154
+ */
155
+ static VALUE
156
+ page_height(VALUE self)
157
+ {
158
+ return rb_float_new( FPDF_GetPageHeight(RB2PG(self)) );
159
+ }
160
+
161
+
162
+ /*
163
+ * call-seq:
164
+ * text -> String
165
+ *
166
+ * Returns the text that is contained on the page as a UTF-16LE encoded string
167
+ */
168
+ static VALUE
169
+ page_text(VALUE self)
170
+ {
171
+ static rb_encoding *enc = rb_enc_find("UTF-16LE");
172
+
173
+
174
+ PageWrapper *pw;
175
+ Data_Get_Struct(self, PageWrapper, pw);
176
+
177
+ CPDF_Page *page = pw->page();
178
+ IPDF_TextPage *text_page = (IPDF_TextPage*)FPDFText_LoadPage(page);
179
+ //
180
+ unsigned int buff_size = text_page->CountChars()*2 + 1; // 16 bit per char, plus terminator
181
+ char *buffer = ALLOC_N(char, buff_size );
182
+
183
+
184
+
185
+ FPDFText_GetText((FPDF_TEXTPAGE)text_page, 0, text_page->CountChars(), (unsigned short*)buffer);
186
+
187
+
188
+ VALUE ret = rb_enc_str_new(buffer, buff_size-1, enc);
189
+
190
+ xfree(buffer);
191
+
192
+ return ret;
193
+ }
194
+
195
+
196
+
197
+
198
+ /*
199
+ * call-seq:
200
+ * number -> Fixnum
201
+ *
202
+ * Returns the page number that the page represents on the document.
203
+ * It is *NOT* zero based, meaning that the first page#number will be 1.
204
+ *
205
+ * *Warning:* if pages are added/removed after the page is loaded, this value will be inaccurate.
206
+ */
207
+ static VALUE
208
+ page_number(VALUE self)
209
+ {
210
+ PageWrapper *pw;
211
+ Data_Get_Struct(self, PageWrapper, pw);
212
+ return INT2FIX(pw->_page_number+1);
213
+ }
214
+
215
+
216
+
217
+ /*
218
+ call-seq:
219
+ images -> ImageList
220
+
221
+ Returns ImageList which contains all the images on the page. Images are lazily loaded only when requested.
222
+
223
+ === Example
224
+ pdf = PDFium::Document.new( "test.pdf" )
225
+ page = pdf.pages.first
226
+ page.images.each do | image |
227
+ image.save("pg-#{page.number}-#{image.index}.png")
228
+ end
229
+
230
+ */
231
+ static VALUE
232
+ page_images(VALUE self)
233
+ {
234
+ VALUE args[1];
235
+ args[0] = self;
236
+ return rb_class_new_instance( 1, args, RB::ImageList() );
237
+ }
238
+
239
+ /*
240
+ call-seq:
241
+ as_image(width:nil, height:nil) -> Image
242
+
243
+ Render a page as an image of width and height to the given file. The image type
244
+ will be auto-detected from the file_path's extension, and can be any of the
245
+ formats supported by the FreeImage library http://freeimage.sourceforge.net/features.html
246
+
247
+ If neither the height or width are given, it will be calculated to retain the
248
+ approprate page scale.
249
+
250
+ Returns an Image instance.
251
+
252
+ === Example
253
+ pdf = PDFium::Document.new( "test.pdf" )
254
+ page = pdf.pages[0]
255
+ page.as_image(height: 100, width: 75).save("pg-#{page.number}-sm.png")
256
+ page.as_image(height: 500).save("pg-#{page.number}-md.png")
257
+ page.as_image(width: 1000).save("pg-#{page.number}-lg.png")
258
+
259
+ If the above page's #dimensions were 1000x1500, then the following images would be generated:
260
+ pg-1-sm.png -> 100x75
261
+ pg-1-md.png -> 500x750
262
+ pg-1-lg.png -> 750x1000
263
+ */
264
+ static VALUE
265
+ page_as_image(int argc, VALUE *argv, VALUE self)
266
+ {
267
+ CPDF_Page *page = RB2PG(self);
268
+
269
+ VALUE rb_options;
270
+ rb_scan_args(argc,argv,":", &rb_options);
271
+ if (NIL_P(rb_options)){
272
+ rb_options=rb_hash_new();
273
+ }
274
+ if ( TYPE(rb_options) != T_HASH ){
275
+ rb_raise(rb_eTypeError, "wrong argument type %s (expected Hash)", rb_obj_classname(rb_options));
276
+ }
277
+
278
+ VALUE width_option = rb_hash_aref(rb_options, rb_sym_width);
279
+ VALUE height_option = rb_hash_aref(rb_options, rb_sym_height);
280
+
281
+ int width = NIL_P(width_option) ? 0 : FIX2INT(width_option);
282
+ int height = NIL_P(height_option) ? 0 : FIX2INT(height_option);
283
+ if (!width && !height){
284
+ width = FPDF_GetPageWidth(page) * 2;
285
+ }
286
+
287
+ if (!width)
288
+ width = FPDF_GetPageWidth(page) * ( (double)height / FPDF_GetPageHeight(page) );
289
+ if (!height)
290
+ height = FPDF_GetPageHeight(page) * ( (double)width / FPDF_GetPageWidth(page) );
291
+
292
+ VALUE args[2];
293
+ args[0] = self;
294
+ VALUE img_options = args[1] = rb_hash_new();
295
+ rb_hash_aset(img_options, rb_sym_width, INT2FIX(width));
296
+ rb_hash_aset(img_options, rb_sym_height, INT2FIX(height));
297
+
298
+ VALUE bounds_args[4];
299
+ bounds_args[0] = rb_float_new( 0 );
300
+ bounds_args[1] = rb_float_new( FPDF_GetPageWidth(page) );
301
+ bounds_args[2] = rb_float_new( 0 );
302
+ bounds_args[3] = rb_float_new( FPDF_GetPageHeight(page) );
303
+ VALUE bounds = rb_class_new_instance( 4, bounds_args, RB::BoundingBox() );
304
+ rb_hash_aset(img_options, ID2SYM(rb_intern("bounds")), bounds);
305
+
306
+ return rb_class_new_instance( 2, args, RB::Image() );
307
+ }
308
+
309
+ /*
310
+ * call-seq:
311
+ * unload -> Page
312
+ *
313
+ * Frees a large portion of the internal memory allocated to the page.
314
+ * When a page is parsed by the PDFIum engine, various elements are cached in memory
315
+ * While Ruby will eventually garbage collect the Page instance once it's no longer
316
+ * in use, this method will free the memory immediatly. Page#unload is safe to use
317
+ * since the Page will re-load itself as needed, but calling it while the page
318
+ * is still in use will cause additional work by the engine since it will have to
319
+ * repeatedly re-parse the page when it re-loads itself.
320
+ *
321
+ * PageList#each will call this method on each page after it yields.
322
+ */
323
+ static VALUE
324
+ page_unload(VALUE self)
325
+ {
326
+ PageWrapper *pw;
327
+ Data_Get_Struct(self, PageWrapper, pw);
328
+ pw->unload();
329
+ return self;
330
+ }
331
+
332
+ // creates and yeilds an image. Not documented since all access
333
+ // should got through the ImageList interface via the Page#images method
334
+ /* :nodoc: */
335
+ static VALUE
336
+ page_each_image(VALUE self)
337
+ {
338
+ PageWrapper *pw;
339
+ Data_Get_Struct(self, PageWrapper, pw);
340
+
341
+ auto count = pw->page()->CountObjects();
342
+ int image_index=0;
343
+ for (int index=0; index < count; index++){
344
+ CPDF_PageObject *object = pw->page()->GetObjectByIndex(index);
345
+ if ( PDFPAGE_IMAGE == object->m_Type ){
346
+ VALUE args[2];
347
+ args[0] = self;
348
+ VALUE img_options = args[1] = rb_hash_new();
349
+
350
+ rb_hash_aset(img_options, ID2SYM(rb_intern("object_index")), INT2FIX(index));
351
+
352
+ rb_hash_aset(img_options, ID2SYM(rb_intern("index")), INT2FIX(image_index));
353
+
354
+ VALUE img = rb_class_new_instance( 2, args, RB::Image() );
355
+ rb_yield( img );
356
+ image_index++;
357
+ }
358
+ }
359
+ return self;
360
+ }
361
+
362
+
363
+ VALUE
364
+ define_page_class()
365
+ {
366
+ rb_sym_width = ID2SYM(rb_intern("width"));
367
+ rb_sym_height = ID2SYM(rb_intern("height"));
368
+
369
+ VALUE RB_PDFium = RB::PDFium();
370
+
371
+ // The Page class definition and methods
372
+ VALUE RB_Page = rb_define_class_under(RB_PDFium, "Page", rb_cObject);
373
+ //rb_define_alloc_func (RB_Page, page_allocate);
374
+ //rb_define_private_method (RB_Page, "initialize", RUBY_METHOD_FUNC(page_initialize), -1);
375
+
376
+ rb_define_singleton_method(RB_Page, "new", RUBY_METHOD_FUNC(page_new), 0);
377
+ rb_define_singleton_method(RB_Page, "open", RUBY_METHOD_FUNC(page_open), 2);
378
+ rb_define_singleton_method(RB_Page, "create", RUBY_METHOD_FUNC(page_create), -1);
379
+
380
+ rb_define_method (RB_Page, "text", RUBY_METHOD_FUNC(page_text), 0);
381
+ rb_define_method (RB_Page, "width", RUBY_METHOD_FUNC(page_width), 0);
382
+ rb_define_method (RB_Page, "height", RUBY_METHOD_FUNC(page_height), 0);
383
+ rb_define_method (RB_Page, "as_image", RUBY_METHOD_FUNC(page_as_image), -1);
384
+ rb_define_method (RB_Page, "unload", RUBY_METHOD_FUNC(page_unload), 0);
385
+ rb_define_method (RB_Page, "number", RUBY_METHOD_FUNC(page_number), 0);
386
+
387
+ rb_define_method (RB_Page, "images", RUBY_METHOD_FUNC(page_images), 0);
388
+
389
+ rb_define_method (RB_Page, "each_image", RUBY_METHOD_FUNC(page_each_image), 0);
390
+
391
+ return RB_Page;
392
+ }
@@ -0,0 +1,5 @@
1
+ #ifndef __PAGE_H__
2
+ #define __PAGE_H__
3
+
4
+
5
+ #endif // __PAGE_H__
@@ -0,0 +1,38 @@
1
+ #include "pdfium.h"
2
+ #include "page_wrapper.h"
3
+
4
+ PageObjectWrapper::PageObjectWrapper():
5
+ page_wrapper(0),
6
+ object(0),
7
+ page_object_index(-1)
8
+ {}
9
+
10
+ PageObjectWrapper::~PageObjectWrapper(){
11
+ if (page_wrapper)
12
+ page_wrapper->release(this);
13
+ if (object){
14
+ // object->Release();
15
+ }
16
+ }
17
+
18
+ void
19
+ PageObjectWrapper::wrap(CPDF_PageObject *obj, PageWrapper *pg){
20
+ this->object = obj;
21
+ this->page_wrapper = pg;
22
+ this->page_wrapper->retain(this);
23
+ }
24
+
25
+
26
+
27
+ CPDF_ImageObject*
28
+ RB2IMG(VALUE self) {
29
+ PageObjectWrapper* po;
30
+ Data_Get_Struct(self, PageObjectWrapper, po);
31
+ return static_cast<CPDF_ImageObject*>(po->object);
32
+ }
33
+
34
+ void
35
+ ImageWrapper::wrap(PageWrapper *page_wrapper){
36
+ this->page_wrapper = page_wrapper;
37
+ this->page_wrapper->retain(this);
38
+ }
@@ -0,0 +1,27 @@
1
+ #ifndef __PAGE_OBJECT_WRAPPER_H__
2
+ #define __PAGE_OBJECT_WRAPPER_H__
3
+
4
+ class PageWrapper;
5
+
6
+ class PageObjectWrapper {
7
+ public:
8
+ PageObjectWrapper();
9
+ ~PageObjectWrapper();
10
+
11
+ void wrap(CPDF_PageObject *object, PageWrapper *page_wrapper);
12
+
13
+ PageWrapper *page_wrapper;
14
+ CPDF_PageObject *object;
15
+ int page_object_index;
16
+ };
17
+
18
+
19
+ class ImageWrapper : public PageObjectWrapper {
20
+ public:
21
+
22
+ void wrap(PageWrapper *page_wrapper);
23
+
24
+ };
25
+
26
+
27
+ #endif // __PAGE_OBJECT_WRAPPER_H__