gocr-ruby 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 18b9142ede00b0c09bc41a71caa18a5090ad44e2
4
- data.tar.gz: b72f4aed15d04583f8330f0f3bccebc0a9649f9e
3
+ metadata.gz: 1d257eb31f2bf08f056a30e19597da354ced4147
4
+ data.tar.gz: b4ed2b93a9fd6aeedf31e874bfec01b14f40b56c
5
5
  SHA512:
6
- metadata.gz: 8ae9ddb96adb3ddf550e59347f597927df70e17df633e9bbb24e0d45b534bdd43bb5075dd3ada115fea4343cefe0f67ccdb53c209a5dd94d925a537ffd03b004
7
- data.tar.gz: 260c67406fb4af5240939c6b16aea210667c907c0ee36ed13a7b94b691485c2ddfada467eb82ea73a4b3779a68e82213f035f0df40ef293177921f08e634771a
6
+ metadata.gz: 8df4f786fbc95987f1cc37c722af93e768d3c71fd53ff108a23184aca8bfe2e1cb1c1b37d0647785ac1267d09ca3d863ea8dba7d9c3e6673b851808e6611622f
7
+ data.tar.gz: b7579da8c2e8f2fa1e385c0afbf484a105ecf7368b0727a55cdfff31c9451ef2d9223886a18580c6af647a53a621a5b4496a2147f851acd025699233e13223a0
data/.gitignore CHANGED
@@ -19,4 +19,6 @@ tmp
19
19
  *.so
20
20
  .rvmrc
21
21
  .idea/
22
- *.log
22
+ *.log
23
+
24
+ GOCR-Ruby.*
data/README.md CHANGED
@@ -1,6 +1,8 @@
1
1
  # gocr-ruby is a gocr binding for ruby
2
2
 
3
- TODO: Write a gem description
3
+ Ruby binding for the GOCR library (http://jocr.sourceforge.net/).
4
+
5
+ It is currently a work in progress; this version is unstable.
4
6
 
5
7
  ## Installation
6
8
 
@@ -18,7 +20,25 @@ Or install it yourself as:
18
20
 
19
21
  ## Usage
20
22
 
21
- TODO: Write usage instructions here
23
+ The current API is almost the same as that of ruby-tesseract-ocr:
24
+
25
+ ```ruby
26
+ require 'gocr'
27
+ e = GOCR::Engine.new(whitelist: '0-9').text_for('image.png')
28
+ ```
29
+
30
+ ### Available options
31
+
32
+ * `:whitelist` - char filter (e.g. hex digits: "0-9A-Fx"; ASCII only)
33
+ * `:database` - database path including final slash (default is ./db/)
34
+ * `:format` - output format (ISO8859_1 TeX HTML XML UTF8 ASCII)
35
+ * `:gray_level` - threshold grey level 0<160<=255 (0 = autodetect)
36
+ * `:numbers_only` - numbers only
37
+ * `:mode` - operation modes (bit pattern, see the official gocr manual)
38
+ * `:certainty` - value of certainty (in percent, 0..100, default=95)
39
+ * `:unrecognize_char` - output this string for every unrecognized character
40
+ * `:dust_size` - dust_size (remove small clusters, -1 = autodetect)
41
+ * `:space_width` - spacewidth/dots (0 = autodetect)
22
42
 
23
43
  ## Contributing
24
44
 
data/Rakefile CHANGED
@@ -9,7 +9,7 @@ require 'rubygems/package_task'
9
9
  #
10
10
  # See https://github.com/luislavena/rake-compiler for details
11
11
 
12
- Rake::ExtensionTask.new 'gocr-ruby' do |ext|
12
+ Rake::ExtensionTask.new 'gocr' do |ext|
13
13
 
14
14
  # This causes the shared object to be placed in lib/my_malloc/my_malloc.so
15
15
  #
@@ -19,7 +19,7 @@ Rake::ExtensionTask.new 'gocr-ruby' do |ext|
19
19
  ext.lib_dir = 'lib/gocr'
20
20
  end
21
21
 
22
- s = Gem::Specification.new 'gocr-ruby', '0.0.1' do |s|
22
+ s = Gem::Specification.new 'gocr', '0.0.1' do |s|
23
23
  s.summary = 'simple gocr wrapper'
24
24
  s.authors = %w[zyablitskiy@gmail.com]
25
25
 
@@ -40,7 +40,7 @@ Gem::PackageTask.new s do end
40
40
  # This isn't a good test, but does provide a sanity check
41
41
 
42
42
  task test: %w[compile] do
43
- ruby '-Ilib', '-rgocr', '-e', 'p GOCR::Image.recognize("image.png")'
43
+ ruby '-Ilib', '-rgocr', '-e', 'p GOCR::Engine.new(numbers_only: false).text_for("image.png")'
44
44
  # ruby '-Ilib', '-rgocr', '-e', 'p 3'
45
45
  end
46
46
 
@@ -42,7 +42,7 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
42
42
  #include "ocr0.h" /* only_numbers */
43
43
  #include "progress.h"
44
44
  #include "version.h"
45
- #include "ruby.h"
45
+ #include <ruby.h>
46
46
 
47
47
  /*
48
48
  #ifndef RSTRING_PTR
@@ -322,9 +322,12 @@ static int read_picture(job_t *job) {
322
322
  }
323
323
 
324
324
  /* subject of change, we need more output for XML (ToDo) */
325
- void print_output(job_t *job) {
325
+ char* print_output(job_t *job) {
326
326
  int linecounter = 0;
327
327
  const char *line;
328
+ char** lines;
329
+ char* output;
330
+ int i = 0, j, n = 0;
328
331
 
329
332
  assert(job);
330
333
 
@@ -332,61 +335,131 @@ void print_output(job_t *job) {
332
335
  simplify code 2010-09-26
333
336
  */
334
337
  linecounter = 0;
338
+ lines = (char**) malloc(job->res.lines.num);
335
339
  line = getTextLine(&(job->res.linelist), linecounter++);
336
340
  while (line) {
341
+ n += strlen(line) + 1;
342
+ lines[i] = (char*) malloc(strlen(line));
343
+ strcpy(lines[i++], line);
337
344
  /* notice: decode() is shiftet to getTextLine since 0.38 */
338
- fputs(line, stdout);
339
345
  if (job->cfg.out_format==HTML) fputs("<br />",stdout);
340
346
  if (job->cfg.out_format!=XML) fputc('\n', stdout);
341
347
  line = getTextLine(&(job->res.linelist), linecounter++);
342
348
  }
343
- free_textlines(&(job->res.linelist));
349
+ // free_textlines(&(job->res.linelist));
350
+
351
+ output = (char*) malloc(n);
352
+ strcpy(output, lines[0]);
353
+ for(j = 1; j < i; j++) {
354
+ strcat(output, "\n");
355
+ strcat(output, lines[j]);
356
+ }
357
+
358
+ return output;
344
359
  }
345
360
 
346
361
  /* FIXME jb: remove JOB; renamed to OCR_JOB 2010-09-26 */
347
362
  job_t *OCR_JOB;
348
363
 
349
364
 
350
- char* gocr_recognize(char* filename) {
351
- char* line;
352
- int multipnm=1;
353
- job_t job1, *job; /* fixme, dont want global variables for lib */
354
- job=OCR_JOB=&job1;
355
-
356
- setvbuf(stdout, (char *) NULL, _IONBF, 0); /* not buffered */
365
+ char* gocr_main(job_t* job) {
366
+ int multipnm=1;
367
+ char* output;
368
+ setvbuf(stdout, (char *) NULL, _IONBF, 0); /* not buffered */
357
369
 
358
- job_init(job); /* init cfg and db */
359
- job->src.fname = filename;
360
- /* load character data base (JS1002: now outside pgm2asc) */
361
- if ( job->cfg.mode & 2 ) /* check for db-option flag */
362
- load_db(job);
370
+ /* load character data base (JS1002: now outside pgm2asc) */
371
+ if ( job->cfg.mode & 2 ) /* check for db-option flag */
372
+ load_db(job);
363
373
  /* load_db uses readpnm() and would conflict with multi images */
364
374
 
375
+ while (multipnm==1) { /* multi-image loop */
376
+
365
377
  job_init_image(job); /* single image */
378
+
366
379
  mark_start(job);
367
- read_picture(job);
368
380
 
381
+ multipnm = read_picture(job);
382
+ /* separation of main and rest for using as lib
383
+ this will be changed later => introduction of set_option()
384
+ for better communication to the engine */
385
+ if (multipnm<0) break; /* read error */
386
+
387
+ /* call main loop */
369
388
  pgm2asc(job);
389
+
370
390
  mark_end(job);
371
- line = getTextLine(&(job->res.linelist), 0);
391
+
392
+ output = print_output(job);
393
+
372
394
  job_free_image(job);
373
- return line;
395
+
396
+ }
397
+
398
+ // return ((multipnm<0)?multipnm:0); /* -1=255 on error, 0 ok */
399
+ return output;
374
400
  }
375
401
 
376
402
 
377
403
  static VALUE image_recognize(VALUE self, VALUE arg) {
378
- char* filename = StringValuePtr(arg);
379
- return rb_str_new2( gocr_recognize(filename) );
404
+ VALUE tmp;
405
+
406
+ job_t job1, *job; /* fixme, dont want global variables for lib */
407
+ job=OCR_JOB=&job1;
408
+ job_init(job);
409
+
410
+ job->src.fname = StringValuePtr(arg);
411
+ tmp = rb_iv_get(self, "@database");
412
+ if (tmp != Qnil) {
413
+ job->cfg.db_path = StringValuePtr(tmp);
414
+ }
415
+ tmp = rb_iv_get(self, "@format");
416
+ if (tmp != Qnil) {
417
+ job->cfg.out_format = NUM2INT(tmp);
418
+ }
419
+ tmp = rb_iv_get(self, "@whitelist");
420
+ if (tmp != Qnil) {
421
+ if (strlen(StringValuePtr(tmp)) > 0)
422
+ job->cfg.cfilter = StringValuePtr(tmp);
423
+ }
424
+ tmp = rb_iv_get(self, "@dust_size");
425
+ if (tmp != Qnil) {
426
+ job->cfg.dust_size = NUM2INT(tmp);
427
+ }
428
+ tmp = rb_iv_get(self, "@gray_level");
429
+ if (tmp != Qnil) {
430
+ job->cfg.cs = NUM2INT(tmp);
431
+ }
432
+ tmp = rb_iv_get(self, "@space_width");
433
+ if (tmp != Qnil) {
434
+ job->cfg.spc = NUM2INT(tmp);
435
+ }
436
+ tmp = rb_iv_get(self, "@mode");
437
+ if (tmp != Qnil) {
438
+ job->cfg.mode |= NUM2INT(tmp);
439
+ }
440
+ tmp = rb_iv_get(self, "@numbers_only");
441
+ if (tmp == Qtrue) {
442
+ job->cfg.only_numbers = 1;
443
+ }
444
+ tmp = rb_iv_get(self, "@certainty");
445
+ if (tmp != Qnil) {
446
+ job->cfg.certainty = NUM2INT(tmp);
447
+ }
448
+ tmp = rb_iv_get(self, "@unrecognize_char");
449
+ if (tmp != Qnil) {
450
+ job->cfg.unrec_marker = StringValuePtr(tmp)[0];
451
+ }
452
+ return rb_str_new2( gocr_main(job) );
380
453
  }
381
454
 
382
455
  /*
383
- * @brief define ruby class GOCR::Image and method recognize
456
+ * @brief define ruby class GOCR::Engine and method text_for
384
457
  *
385
458
  */
386
459
  void Init_gocr() {
387
460
  VALUE mGocr = rb_define_module("GOCR");
388
- VALUE mImage = rb_define_class_under(mGocr, "Image", rb_cObject);
389
- rb_define_singleton_method(mImage, "recognize", image_recognize, 1);
461
+ VALUE mImage = rb_define_class_under(mGocr, "Engine", rb_cObject);
462
+ rb_define_method(mImage, "text_for", image_recognize, 1);
390
463
  }
391
464
 
392
465
 
@@ -433,4 +506,4 @@ void Init_gocr() {
433
506
  // }
434
507
  //
435
508
  // return ((multipnm<0)?multipnm:0); /* -1=255 on error, 0 ok */
436
- //}
509
+ //}
@@ -44,7 +44,7 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
44
44
  #include <stddef.h>
45
45
 
46
46
  enum format {
47
- ISO8859_1, TeX, HTML, XML, SGML, UTF8, ASCII
47
+ UTF8, ISO8859_1, TeX, HTML, XML, SGML, ASCII
48
48
  };
49
49
  typedef enum format FORMAT;
50
50
 
data/image.png CHANGED
Binary file
@@ -1,5 +1,5 @@
1
1
  require "gocr/version"
2
- require "gocr/image"
2
+ require "gocr/engine"
3
3
 
4
4
  module GOCR
5
5
  # Your code goes here...
@@ -0,0 +1,20 @@
1
+ require "gocr/gocr"
2
+
3
+ module GOCR
4
+ class Engine
5
+ attr_accessor :whitelist, :blacklist, :database, :format, :gray_level, :numbers_only,
6
+ :mode, :certainty, :unrecognize_char, :dust_size, :space_width
7
+
8
+ FORMATS = Hash[%w(UTF8 ISO8859_1 TeX HTML XML ASCII).map.with_index.to_a].freeze
9
+
10
+ def initialize(options={})
11
+ options.each do |k, v|
12
+ send("#{k}=", v) if respond_to?(k)
13
+ end
14
+ @format = FORMATS[format].to_i
15
+ @dust_size = -1 if dust_size.nil?
16
+ @unrecognize_char = @unrecognize_char[0] unless unrecognize_char.nil?
17
+ end
18
+
19
+ end
20
+ end
@@ -1,3 +1,3 @@
1
1
  module GOCR
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gocr-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Vladimir Zyablitskiy
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-03-28 00:00:00.000000000 Z
11
+ date: 2014-03-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -126,7 +126,7 @@ files:
126
126
  - gocr-ruby.gemspec
127
127
  - image.png
128
128
  - lib/gocr.rb
129
- - lib/gocr/image.rb
129
+ - lib/gocr/engine.rb
130
130
  - lib/gocr/version.rb
131
131
  homepage: https://github.com/rainlabs/gocr-ruby
132
132
  licenses:
@@ -1,8 +0,0 @@
1
- require "gocr/gocr"
2
-
3
- module GOCR
4
- class Image
5
-
6
- end
7
- end
8
-