gocr-ruby 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 18b9142ede00b0c09bc41a71caa18a5090ad44e2
4
- data.tar.gz: b72f4aed15d04583f8330f0f3bccebc0a9649f9e
3
+ metadata.gz: 1d257eb31f2bf08f056a30e19597da354ced4147
4
+ data.tar.gz: b4ed2b93a9fd6aeedf31e874bfec01b14f40b56c
5
5
  SHA512:
6
- metadata.gz: 8ae9ddb96adb3ddf550e59347f597927df70e17df633e9bbb24e0d45b534bdd43bb5075dd3ada115fea4343cefe0f67ccdb53c209a5dd94d925a537ffd03b004
7
- data.tar.gz: 260c67406fb4af5240939c6b16aea210667c907c0ee36ed13a7b94b691485c2ddfada467eb82ea73a4b3779a68e82213f035f0df40ef293177921f08e634771a
6
+ metadata.gz: 8df4f786fbc95987f1cc37c722af93e768d3c71fd53ff108a23184aca8bfe2e1cb1c1b37d0647785ac1267d09ca3d863ea8dba7d9c3e6673b851808e6611622f
7
+ data.tar.gz: b7579da8c2e8f2fa1e385c0afbf484a105ecf7368b0727a55cdfff31c9451ef2d9223886a18580c6af647a53a621a5b4496a2147f851acd025699233e13223a0
data/.gitignore CHANGED
@@ -19,4 +19,6 @@ tmp
19
19
  *.so
20
20
  .rvmrc
21
21
  .idea/
22
- *.log
22
+ *.log
23
+
24
+ GOCR-Ruby.*
data/README.md CHANGED
@@ -1,6 +1,8 @@
1
1
  # gocr-ruby is a gocr binding for ruby
2
2
 
3
- TODO: Write a gem description
3
+ Ruby binding for the GOCR library (http://jocr.sourceforge.net/).
4
+
5
+ It is currently a work in progress; this version is unstable.
4
6
 
5
7
  ## Installation
6
8
 
@@ -18,7 +20,25 @@ Or install it yourself as:
18
20
 
19
21
  ## Usage
20
22
 
21
- TODO: Write usage instructions here
23
+ The current API is almost the same as that of ruby-tesseract-ocr:
24
+
25
+ ```ruby
26
+ require 'gocr'
27
+ e = GOCR::Engine.new(whitelist: '0-9').text_for('image.png')
28
+ ```
29
+
30
+ ### Available options
31
+
32
+ * `:whitelist` - char filter (e.g. hex digits: "0-9A-Fx"; ASCII only)
33
+ * `:database` - database path including final slash (default is ./db/)
34
+ * `:format` - output format (ISO8859_1 TeX HTML XML UTF8 ASCII)
35
+ * `:gray_level` - threshold grey level 0<160<=255 (0 = autodetect)
36
+ * `:numbers_only` - numbers only
37
+ * `:mode` - operation modes (bit pattern, see the official gocr manual)
38
+ * `:certainty` - value of certainty (in percent, 0..100, default=95)
39
+ * `:unrecognize_char` - output this string for every unrecognized character
40
+ * `:dust_size` - dust_size (remove small clusters, -1 = autodetect)
41
+ * `:space_width` - spacewidth/dots (0 = autodetect)
22
42
 
23
43
  ## Contributing
24
44
 
data/Rakefile CHANGED
@@ -9,7 +9,7 @@ require 'rubygems/package_task'
9
9
  #
10
10
  # See https://github.com/luislavena/rake-compiler for details
11
11
 
12
- Rake::ExtensionTask.new 'gocr-ruby' do |ext|
12
+ Rake::ExtensionTask.new 'gocr' do |ext|
13
13
 
14
14
  # This causes the shared object to be placed in lib/my_malloc/my_malloc.so
15
15
  #
@@ -19,7 +19,7 @@ Rake::ExtensionTask.new 'gocr-ruby' do |ext|
19
19
  ext.lib_dir = 'lib/gocr'
20
20
  end
21
21
 
22
- s = Gem::Specification.new 'gocr-ruby', '0.0.1' do |s|
22
+ s = Gem::Specification.new 'gocr', '0.0.1' do |s|
23
23
  s.summary = 'simple gocr wrapper'
24
24
  s.authors = %w[zyablitskiy@gmail.com]
25
25
 
@@ -40,7 +40,7 @@ Gem::PackageTask.new s do end
40
40
  # This isn't a good test, but does provide a sanity check
41
41
 
42
42
  task test: %w[compile] do
43
- ruby '-Ilib', '-rgocr', '-e', 'p GOCR::Image.recognize("image.png")'
43
+ ruby '-Ilib', '-rgocr', '-e', 'p GOCR::Engine.new(numbers_only: false).text_for("image.png")'
44
44
  # ruby '-Ilib', '-rgocr', '-e', 'p 3'
45
45
  end
46
46
 
@@ -42,7 +42,7 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
42
42
  #include "ocr0.h" /* only_numbers */
43
43
  #include "progress.h"
44
44
  #include "version.h"
45
- #include "ruby.h"
45
+ #include <ruby.h>
46
46
 
47
47
  /*
48
48
  #ifndef RSTRING_PTR
@@ -322,9 +322,12 @@ static int read_picture(job_t *job) {
322
322
  }
323
323
 
324
324
  /* subject of change, we need more output for XML (ToDo) */
325
- void print_output(job_t *job) {
325
+ char* print_output(job_t *job) {
326
326
  int linecounter = 0;
327
327
  const char *line;
328
+ char** lines;
329
+ char* output;
330
+ int i = 0, j, n = 0;
328
331
 
329
332
  assert(job);
330
333
 
@@ -332,61 +335,131 @@ void print_output(job_t *job) {
332
335
  simplify code 2010-09-26
333
336
  */
334
337
  linecounter = 0;
338
+ lines = (char**) malloc(job->res.lines.num);
335
339
  line = getTextLine(&(job->res.linelist), linecounter++);
336
340
  while (line) {
341
+ n += strlen(line) + 1;
342
+ lines[i] = (char*) malloc(strlen(line));
343
+ strcpy(lines[i++], line);
337
344
  /* notice: decode() is shiftet to getTextLine since 0.38 */
338
- fputs(line, stdout);
339
345
  if (job->cfg.out_format==HTML) fputs("<br />",stdout);
340
346
  if (job->cfg.out_format!=XML) fputc('\n', stdout);
341
347
  line = getTextLine(&(job->res.linelist), linecounter++);
342
348
  }
343
- free_textlines(&(job->res.linelist));
349
+ // free_textlines(&(job->res.linelist));
350
+
351
+ output = (char*) malloc(n);
352
+ strcpy(output, lines[0]);
353
+ for(j = 1; j < i; j++) {
354
+ strcat(output, "\n");
355
+ strcat(output, lines[j]);
356
+ }
357
+
358
+ return output;
344
359
  }
345
360
 
346
361
  /* FIXME jb: remove JOB; renamed to OCR_JOB 2010-09-26 */
347
362
  job_t *OCR_JOB;
348
363
 
349
364
 
350
- char* gocr_recognize(char* filename) {
351
- char* line;
352
- int multipnm=1;
353
- job_t job1, *job; /* fixme, dont want global variables for lib */
354
- job=OCR_JOB=&job1;
355
-
356
- setvbuf(stdout, (char *) NULL, _IONBF, 0); /* not buffered */
365
+ char* gocr_main(job_t* job) {
366
+ int multipnm=1;
367
+ char* output;
368
+ setvbuf(stdout, (char *) NULL, _IONBF, 0); /* not buffered */
357
369
 
358
- job_init(job); /* init cfg and db */
359
- job->src.fname = filename;
360
- /* load character data base (JS1002: now outside pgm2asc) */
361
- if ( job->cfg.mode & 2 ) /* check for db-option flag */
362
- load_db(job);
370
+ /* load character data base (JS1002: now outside pgm2asc) */
371
+ if ( job->cfg.mode & 2 ) /* check for db-option flag */
372
+ load_db(job);
363
373
  /* load_db uses readpnm() and would conflict with multi images */
364
374
 
375
+ while (multipnm==1) { /* multi-image loop */
376
+
365
377
  job_init_image(job); /* single image */
378
+
366
379
  mark_start(job);
367
- read_picture(job);
368
380
 
381
+ multipnm = read_picture(job);
382
+ /* separation of main and rest for using as lib
383
+ this will be changed later => introduction of set_option()
384
+ for better communication to the engine */
385
+ if (multipnm<0) break; /* read error */
386
+
387
+ /* call main loop */
369
388
  pgm2asc(job);
389
+
370
390
  mark_end(job);
371
- line = getTextLine(&(job->res.linelist), 0);
391
+
392
+ output = print_output(job);
393
+
372
394
  job_free_image(job);
373
- return line;
395
+
396
+ }
397
+
398
+ // return ((multipnm<0)?multipnm:0); /* -1=255 on error, 0 ok */
399
+ return output;
374
400
  }
375
401
 
376
402
 
377
403
  static VALUE image_recognize(VALUE self, VALUE arg) {
378
- char* filename = StringValuePtr(arg);
379
- return rb_str_new2( gocr_recognize(filename) );
404
+ VALUE tmp;
405
+
406
+ job_t job1, *job; /* fixme, dont want global variables for lib */
407
+ job=OCR_JOB=&job1;
408
+ job_init(job);
409
+
410
+ job->src.fname = StringValuePtr(arg);
411
+ tmp = rb_iv_get(self, "@database");
412
+ if (tmp != Qnil) {
413
+ job->cfg.db_path = StringValuePtr(tmp);
414
+ }
415
+ tmp = rb_iv_get(self, "@format");
416
+ if (tmp != Qnil) {
417
+ job->cfg.out_format = NUM2INT(tmp);
418
+ }
419
+ tmp = rb_iv_get(self, "@whitelist");
420
+ if (tmp != Qnil) {
421
+ if (strlen(StringValuePtr(tmp)) > 0)
422
+ job->cfg.cfilter = StringValuePtr(tmp);
423
+ }
424
+ tmp = rb_iv_get(self, "@dust_size");
425
+ if (tmp != Qnil) {
426
+ job->cfg.dust_size = NUM2INT(tmp);
427
+ }
428
+ tmp = rb_iv_get(self, "@gray_level");
429
+ if (tmp != Qnil) {
430
+ job->cfg.cs = NUM2INT(tmp);
431
+ }
432
+ tmp = rb_iv_get(self, "@space_width");
433
+ if (tmp != Qnil) {
434
+ job->cfg.spc = NUM2INT(tmp);
435
+ }
436
+ tmp = rb_iv_get(self, "@mode");
437
+ if (tmp != Qnil) {
438
+ job->cfg.mode |= NUM2INT(tmp);
439
+ }
440
+ tmp = rb_iv_get(self, "@numbers_only");
441
+ if (tmp == Qtrue) {
442
+ job->cfg.only_numbers = 1;
443
+ }
444
+ tmp = rb_iv_get(self, "@certainty");
445
+ if (tmp != Qnil) {
446
+ job->cfg.certainty = NUM2INT(tmp);
447
+ }
448
+ tmp = rb_iv_get(self, "@unrecognize_char");
449
+ if (tmp != Qnil) {
450
+ job->cfg.unrec_marker = StringValuePtr(tmp)[0];
451
+ }
452
+ return rb_str_new2( gocr_main(job) );
380
453
  }
381
454
 
382
455
  /*
383
- * @brief define ruby class GOCR::Image and method recognize
456
+ * @brief define ruby class GOCR::Engine and method text_for
384
457
  *
385
458
  */
386
459
  void Init_gocr() {
387
460
  VALUE mGocr = rb_define_module("GOCR");
388
- VALUE mImage = rb_define_class_under(mGocr, "Image", rb_cObject);
389
- rb_define_singleton_method(mImage, "recognize", image_recognize, 1);
461
+ VALUE mImage = rb_define_class_under(mGocr, "Engine", rb_cObject);
462
+ rb_define_method(mImage, "text_for", image_recognize, 1);
390
463
  }
391
464
 
392
465
 
@@ -433,4 +506,4 @@ void Init_gocr() {
433
506
  // }
434
507
  //
435
508
  // return ((multipnm<0)?multipnm:0); /* -1=255 on error, 0 ok */
436
- //}
509
+ //}
@@ -44,7 +44,7 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
44
44
  #include <stddef.h>
45
45
 
46
46
  enum format {
47
- ISO8859_1, TeX, HTML, XML, SGML, UTF8, ASCII
47
+ UTF8, ISO8859_1, TeX, HTML, XML, SGML, ASCII
48
48
  };
49
49
  typedef enum format FORMAT;
50
50
 
data/image.png CHANGED
Binary file
@@ -1,5 +1,5 @@
1
1
  require "gocr/version"
2
- require "gocr/image"
2
+ require "gocr/engine"
3
3
 
4
4
  module GOCR
5
5
  # Your code goes here...
@@ -0,0 +1,20 @@
1
+ require "gocr/gocr"
2
+
3
+ module GOCR
4
+ class Engine
5
+ attr_accessor :whitelist, :blacklist, :database, :format, :gray_level, :numbers_only,
6
+ :mode, :certainty, :unrecognize_char, :dust_size, :space_width
7
+
8
+ FORMATS = Hash[%w(UTF8 ISO8859_1 TeX HTML XML ASCII).map.with_index.to_a].freeze
9
+
10
+ def initialize(options={})
11
+ options.each do |k, v|
12
+ send("#{k}=", v) if respond_to?(k)
13
+ end
14
+ @format = FORMATS[format].to_i
15
+ @dust_size = -1 if dust_size.nil?
16
+ @unrecognize_char = @unrecognize_char[0] unless unrecognize_char.nil?
17
+ end
18
+
19
+ end
20
+ end
@@ -1,3 +1,3 @@
1
1
  module GOCR
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gocr-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Vladimir Zyablitskiy
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-03-28 00:00:00.000000000 Z
11
+ date: 2014-03-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -126,7 +126,7 @@ files:
126
126
  - gocr-ruby.gemspec
127
127
  - image.png
128
128
  - lib/gocr.rb
129
- - lib/gocr/image.rb
129
+ - lib/gocr/engine.rb
130
130
  - lib/gocr/version.rb
131
131
  homepage: https://github.com/rainlabs/gocr-ruby
132
132
  licenses:
@@ -1,8 +0,0 @@
1
- require "gocr/gocr"
2
-
3
- module GOCR
4
- class Image
5
-
6
- end
7
- end
8
-