simple_tesseract 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/ext/extconf.rb +6 -0
- data/ext/simple_tesseract_ext.cpp +41 -0
- data/lib/tesseract.rb +72 -0
- metadata +78 -0
data/ext/simple_tesseract_ext.cpp
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
#include <stdio.h>
|
2
|
+
#include <ruby/ruby.h>
|
3
|
+
#include <tiffio.h>
|
4
|
+
#include <tesseract/img.h>
|
5
|
+
#include <tesseract/imgs.h>
|
6
|
+
#include <tesseract/baseapi.h>
|
7
|
+
|
8
|
+
// Forward declaration only: no definition appears in this file.
// NOTE(review): presumably provided by the Tesseract library and fills
// `image` from the already-opened TIFF handle `tif` -- confirm against the
// Tesseract version this extension links with.
void read_tiff_image(TIFF* tif, IMAGE* image);
|
9
|
+
|
10
|
+
extern "C" {
|
11
|
+
// Handle of the Ruby-level `Tesseract` class; assigned in
// Init_simple_tesseract_ext() when the extension is loaded.
VALUE rb_cTesseract;
|
12
|
+
|
13
|
+
/*
 * Tesseract#get_text(language, file, x, y, width, height) -> String
 *
 * Opens the TIFF image at `file`, runs Tesseract over the rectangle
 * (x, y, width, height) and returns the recognised text as a Ruby String.
 *
 * language - String language code handed to Tesseract, or nil to pass NULL.
 * file     - String path of the TIFF image to read.
 * rx, ry, rwidth, rheight - Integers describing the rectangle to recognise.
 *
 * Raises TypeError/ArgumentError when an argument has the wrong type or a
 * string contains an embedded NUL, and IOError when the TIFF cannot be opened.
 */
VALUE
rb_cTesseract_get_text(VALUE self, VALUE language, VALUE file,
    VALUE rx, VALUE ry, VALUE rwidth, VALUE rheight) {
  /* Convert every Ruby argument before acquiring any native resource: these
   * conversions may raise (longjmp), which would otherwise leak the TIFF
   * handle. StringValueCStr also type-checks and guarantees NUL termination,
   * which a bare RSTRING_PTR does not. */
  const char *lang = NIL_P(language) ? NULL : StringValueCStr(language);
  const char *path = StringValueCStr(file);
  int x = NUM2INT(rx),
      y = NUM2INT(ry),
      width = NUM2INT(rwidth),
      height = NUM2INT(rheight);

  TIFF *tif = TIFFOpen(path, "r");
  if (tif == NULL)
    rb_raise(rb_eIOError, "cannot open TIFF image: %s", path);

  char *text;
  {
    /* Scoped so that IMAGE is destroyed before any Ruby call that may raise. */
    IMAGE img;
    read_tiff_image(tif, &img);
    int bytes_per_line = check_legal_image_size(img.get_xsize(),
        img.get_ysize(), img.get_bpp());

    TessBaseAPI::InitWithLanguage(NULL, NULL, lang, NULL, false, 0, NULL);
    text = TessBaseAPI::TesseractRect(img.get_buffer(), img.get_bpp()/8,
        bytes_per_line, x, y, width, height);
    TessBaseAPI::End();
  }
  TIFFClose(tif);

  /* rb_str_new2 copies the bytes (and raises ArgumentError on NULL, as
   * before); the buffer handed back by TesseractRect belongs to the caller
   * and has to be released with delete []. */
  VALUE result = rb_str_new2(text);
  delete [] text;
  return result;
}
|
35
|
+
|
36
|
+
void
|
37
|
+
Init_simple_tesseract_ext() {
|
38
|
+
rb_cTesseract = rb_define_class("Tesseract", rb_cObject);
|
39
|
+
rb_define_private_method(rb_cTesseract, "get_text", (VALUE (*)(...))rb_cTesseract_get_text, 6);
|
40
|
+
}
|
41
|
+
}
|
data/lib/tesseract.rb
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
require 'RMagick'
require 'simple_tesseract_ext'
require 'stringio'
require 'tempfile'
|
4
|
+
|
5
|
+
# Ruby front-end for the +simple_tesseract_ext+ extension.
#
# An instance holds either an image path (+src+) or raw image data (+blob+),
# a language code and an optional +editor+ callable that receives the loaded
# Magick image and returns the image to recognise. The image is written to a
# temporary TIFF file whose path is handed to the extension's private
# +get_text+ method.
class Tesseract
  attr_reader :src, :blob
  attr_accessor :lang, :editor

  # Creates a recognizer.
  #
  # opts - Hash of options (the hash itself is not modified):
  #        :lang / :language         - language code (default 'eng').
  #        :src / :source / :image   - path of the image to read.
  #        :blob                     - raw image data; wins over the path
  #                                    when both are given.
  #        :editor                   - callable applied to the loaded image
  #                                    (default: identity).
  #        :strip                    - truthy to strip the recognised text.
  def initialize(opts = {})
    @lang = opts[:lang] || opts[:language] || 'eng'
    self.src = opts[:src] || opts[:source] || opts[:image]
    self.blob = opts[:blob]
    @editor = opts[:editor] || lambda { |image| image }
    self.strip = opts[:strip]

    # Keep the Tempfile object referenced for the lifetime of this instance.
    # Its own finalizer deletes the file once it is garbage collected, so
    # dropping the reference would let the file vanish between writing the
    # image and running OCR. Holding it also gives automatic cleanup when this
    # object is collected, without a finalizer that captures +self+.
    @tmpfile = Tempfile.new(['rbtesseract', '.tiff'])
    @tmpfile.close
    @tmp = @tmpfile.path
  end

  # Sets the image path. A non-nil path discards any previously set blob.
  def src=(file)
    @blob = nil if file
    @src = file
  end

  # Sets the raw image data. Non-nil data discards any previously set path.
  def blob=(string)
    @src = nil if string
    @blob = string
  end

  # Enables or disables stripping of the recognised text (coerced to boolean).
  def strip=(bool)
    @strip = !!bool
  end

  # Returns true when recognised text is stripped before being returned.
  def strip?
    @strip
  end

  alias language lang
  alias language= lang=
  alias source src
  alias source= src=
  alias image src
  alias image= src=

  # Recognises the text inside a rectangle of the image.
  #
  # x, y          - origin of the rectangle (nil is treated as 0).
  # width, height - size of the rectangle; when omitted, the dimensions of
  #                 the image written to the temporary file are used.
  #
  # Returns the recognised String (stripped when +strip?+ is set).
  # Raises ArgumentError when neither +src+ nor +blob+ has been set.
  def solve(x = 0, y = 0, width = nil, height = nil)
    raise ArgumentError, 'no source image or blob given' unless @src || @blob

    original = (@src ? Magick::Image.read(@src) : Magick::Image.from_blob(@blob)).first
    editor.call(original).write(@tmp)

    x ||= 0
    y ||= 0
    # Only re-read the written file when a dimension actually has to be
    # looked up.
    unless width && height
      written = Magick::Image.read(@tmp).first
      width ||= written.columns
      height ||= written.rows
    end

    text = get_text(@lang, @tmp, x, y, width, height)
    text.strip! if strip?
    text
  end

  # Recognises several rectangles and concatenates the results.
  #
  # areas - any number of [x, y, width, height] arrays, splatted into #solve.
  def crops(*areas)
    areas.map { |area| solve(*area) }.join
  end

  # Recognises the whole image.
  def to_s
    solve
  end

  # Removes the temporary TIFF file. Best effort: operating-system errors
  # (for example the file already being gone) are ignored and yield nil.
  def finalize
    File.unlink(@tmp)
  rescue SystemCallError
    nil
  end
  alias close finalize
end
|
metadata
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: simple_tesseract
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
version: 0.0.1
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- shura
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2011-09-06 00:00:00 +02:00
|
18
|
+
default_executable:
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: rmagick
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
none: false
|
25
|
+
requirements:
|
26
|
+
- - ">="
|
27
|
+
- !ruby/object:Gem::Version
|
28
|
+
segments:
|
29
|
+
- 0
|
30
|
+
version: "0"
|
31
|
+
type: :runtime
|
32
|
+
version_requirements: *id001
|
33
|
+
description: tesseract ruby bindings
|
34
|
+
email: shura1991@gmail.com
|
35
|
+
executables: []
|
36
|
+
|
37
|
+
extensions:
|
38
|
+
- ext/extconf.rb
|
39
|
+
extra_rdoc_files: []
|
40
|
+
|
41
|
+
files:
|
42
|
+
- lib/tesseract.rb
|
43
|
+
- ext/simple_tesseract_ext.cpp
|
44
|
+
- ext/extconf.rb
|
45
|
+
has_rdoc: true
|
46
|
+
homepage: http://github.com/shurizzle/simple_tesseract
|
47
|
+
licenses: []
|
48
|
+
|
49
|
+
post_install_message:
|
50
|
+
rdoc_options: []
|
51
|
+
|
52
|
+
require_paths:
|
53
|
+
- lib
|
54
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
55
|
+
none: false
|
56
|
+
requirements:
|
57
|
+
- - ">="
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
segments:
|
60
|
+
- 0
|
61
|
+
version: "0"
|
62
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
63
|
+
none: false
|
64
|
+
requirements:
|
65
|
+
- - ">="
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
segments:
|
68
|
+
- 0
|
69
|
+
version: "0"
|
70
|
+
requirements: []
|
71
|
+
|
72
|
+
rubyforge_project:
|
73
|
+
rubygems_version: 1.3.7
|
74
|
+
signing_key:
|
75
|
+
specification_version: 3
|
76
|
+
summary: tesseract ruby bindings
|
77
|
+
test_files: []
|
78
|
+
|