simple_tesseract 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ext/extconf.rb +6 -0
- data/ext/simple_tesseract_ext.cpp +41 -0
- data/lib/tesseract.rb +72 -0
- metadata +78 -0
data/ext/extconf.rb
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
#include <stdio.h>
|
2
|
+
#include <ruby/ruby.h>
|
3
|
+
#include <tiffio.h>
|
4
|
+
#include <tesseract/img.h>
|
5
|
+
#include <tesseract/imgs.h>
|
6
|
+
#include <tesseract/baseapi.h>
|
7
|
+
|
8
|
+
void read_tiff_image(TIFF* tif, IMAGE* image);
|
9
|
+
|
10
|
+
extern "C" {
|
11
|
+
VALUE rb_cTesseract;
|
12
|
+
|
13
|
+
/*
 * Tesseract#get_text(language, file, x, y, width, height) -> String
 *
 * Runs Tesseract OCR over a rectangle of a TIFF file and returns the
 * recognised text as a new Ruby String.
 *
 *   language - String language code handed to Tesseract, or nil to pass NULL.
 *   file     - String path of the TIFF image to open.
 *   rx, ry   - Integer coordinates of the rectangle origin.
 *   rwidth, rheight - Integer size of the rectangle.
 *
 * Raises TypeError/ArgumentError when an argument cannot be converted and
 * IOError when the TIFF file cannot be opened.
 */
VALUE
rb_cTesseract_get_text(VALUE self, VALUE language, VALUE file,
    VALUE rx, VALUE ry, VALUE rwidth, VALUE rheight) {
  /* Convert every Ruby argument first: these conversions may raise, and a
   * Ruby exception unwinds with longjmp, so nothing native may be held yet. */
  const char *lang = NIL_P(language) ? NULL : StringValueCStr(language);
  const char *path = StringValueCStr(file);
  int x = NUM2INT(rx),
      y = NUM2INT(ry),
      width = NUM2INT(rwidth),
      height = NUM2INT(rheight);

  TIFF *tif = TIFFOpen(path, "r");
  if (tif == NULL)
    rb_raise(rb_eIOError, "cannot open TIFF file: %s", path);

  char *text = NULL;
  {
    /* IMAGE is confined to this scope so it is destroyed before any later
     * call that could raise. */
    IMAGE img;
    read_tiff_image(tif, &img);
    int bytes_per_line = check_legal_image_size(img.get_xsize(),
        img.get_ysize(), img.get_bpp());

    TessBaseAPI::InitWithLanguage(NULL, NULL, lang, NULL, false, 0, NULL);
    text = TessBaseAPI::TesseractRect(img.get_buffer(), img.get_bpp()/8,
        bytes_per_line, x, y, width, height);
    TessBaseAPI::End();
  }
  TIFFClose(tif);

  /* rb_str_new2 copies the bytes (and raises ArgumentError on NULL, as the
   * original did), so the Tesseract buffer can be released afterwards.
   * Per the Tesseract API notes the buffer is owned by the caller and is
   * released with delete[]. */
  VALUE result = rb_str_new2(text);
  delete[] text;

  return result;
}
|
35
|
+
|
36
|
+
void
|
37
|
+
Init_simple_tesseract_ext() {
|
38
|
+
rb_cTesseract = rb_define_class("Tesseract", rb_cObject);
|
39
|
+
rb_define_private_method(rb_cTesseract, "get_text", (VALUE (*)(...))rb_cTesseract_get_text, 6);
|
40
|
+
}
|
41
|
+
}
|
data/lib/tesseract.rb
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
require 'RMagick'
require 'simple_tesseract_ext'
require 'stringio'
require 'tempfile'
|
4
|
+
|
5
|
+
# OCR front-end around the native +get_text+ method.
#
# An instance holds either an image source (+src+) or raw image data
# (+blob+), never both: assigning one clears the other. On #solve the image
# is loaded with RMagick, passed through the +editor+ callable, written to a
# scratch TIFF file and handed to the native extension.
class Tesseract
  attr_reader :src, :blob
  attr_accessor :lang, :editor

  # Builds a recognizer.
  #
  # opts - Hash of options (the caller's hash is left untouched):
  #        :lang / :language         - language code, defaults to 'eng'
  #        :src / :source / :image   - image source passed to Magick::Image.read
  #        :blob                     - raw image data for Magick::Image.from_blob
  #        :editor                   - callable receiving the loaded image and
  #                                    returning the image to recognise;
  #                                    defaults to the identity
  #        :strip                    - truthy to strip the recognised text
  def initialize(opts = {})
    # Work on a copy so the deletes below do not empty the caller's hash.
    opts = opts.dup

    @lang = opts.delete(:lang) || opts.delete(:language) || 'eng'
    self.src = opts.delete(:src) || opts.delete(:source) || opts.delete(:image)
    self.blob = opts.delete(:blob)
    @editor = opts.delete(:editor) || lambda { |image| image }
    self.strip = opts.delete(:strip)

    # Keep the Tempfile object referenced for the lifetime of this instance:
    # if it were garbage collected its own finalizer would delete the scratch
    # file while @tmp is still in use. The same finalizer also removes the
    # file once this instance is collected, so no finalizer capturing +self+
    # is registered here.
    @tmpfile = Tempfile.new(['rbtesseract', '.tiff'])
    @tmpfile.close
    @tmp = @tmpfile.path
  end

  # Sets the image source; a non-nil value discards any blob.
  def src=(file)
    @blob = nil if file
    @src = file
  end

  # Sets the raw image data; a non-nil value discards any source.
  def blob=(string)
    @src = nil if string
    @blob = string
  end

  # Enables or disables stripping of the recognised text (coerced to boolean).
  def strip=(bool)
    @strip = !!bool
  end

  # Returns true when recognised text is stripped before being returned.
  def strip?
    @strip
  end

  alias language lang
  alias language= lang=
  alias source src
  alias source= src=
  alias image src
  alias image= src=

  # Recognises the text inside a rectangle of the image.
  #
  # x, y          - origin of the rectangle; nil is treated as 0.
  # width, height - size of the rectangle; nil means the full width/height of
  #                 the image written to the scratch file.
  #
  # Returns the recognised String, stripped when #strip? is true.
  def solve(x = 0, y = 0, width = nil, height = nil)
    frames = @src ? Magick::Image.read(@src) : Magick::Image.from_blob(@blob)
    editor.call(frames.first).write(@tmp)

    x ||= 0
    y ||= 0
    # The scratch file only has to be read back when a dimension is missing.
    if width.nil? || height.nil?
      written = Magick::Image.read(@tmp).first
      width ||= written.columns
      height ||= written.rows
    end

    text = get_text(@lang, @tmp, x, y, width, height)
    text.strip! if strip?
    text
  end

  # Recognises several areas and concatenates the results.
  #
  # areas - Arrays of [x, y, width, height], each splatted into #solve.
  #
  # Returns the joined String.
  def crops(*areas)
    areas.map { |area| solve(*area) }.join
  end

  # Recognises the whole image.
  def to_s
    solve
  end

  # Deletes the scratch file. Best effort: filesystem errors are ignored and
  # calling it more than once is harmless. A later #solve writes the scratch
  # path again, and that file is then not removed automatically.
  #
  # Returns nil.
  def finalize
    @tmpfile.close!
    nil
  rescue SystemCallError
    nil
  end
  alias close finalize
end
|
metadata
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: simple_tesseract
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
version: 0.0.1
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- shura
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2011-09-06 00:00:00 +02:00
|
18
|
+
default_executable:
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: rmagick
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
none: false
|
25
|
+
requirements:
|
26
|
+
- - ">="
|
27
|
+
- !ruby/object:Gem::Version
|
28
|
+
segments:
|
29
|
+
- 0
|
30
|
+
version: "0"
|
31
|
+
type: :runtime
|
32
|
+
version_requirements: *id001
|
33
|
+
description: tesseract ruby bindings
|
34
|
+
email: shura1991@gmail.com
|
35
|
+
executables: []
|
36
|
+
|
37
|
+
extensions:
|
38
|
+
- ext/extconf.rb
|
39
|
+
extra_rdoc_files: []
|
40
|
+
|
41
|
+
files:
|
42
|
+
- lib/tesseract.rb
|
43
|
+
- ext/simple_tesseract_ext.cpp
|
44
|
+
- ext/extconf.rb
|
45
|
+
has_rdoc: true
|
46
|
+
homepage: http://github.com/shurizzle/simple_tesseract
|
47
|
+
licenses: []
|
48
|
+
|
49
|
+
post_install_message:
|
50
|
+
rdoc_options: []
|
51
|
+
|
52
|
+
require_paths:
|
53
|
+
- lib
|
54
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
55
|
+
none: false
|
56
|
+
requirements:
|
57
|
+
- - ">="
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
segments:
|
60
|
+
- 0
|
61
|
+
version: "0"
|
62
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
63
|
+
none: false
|
64
|
+
requirements:
|
65
|
+
- - ">="
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
segments:
|
68
|
+
- 0
|
69
|
+
version: "0"
|
70
|
+
requirements: []
|
71
|
+
|
72
|
+
rubyforge_project:
|
73
|
+
rubygems_version: 1.3.7
|
74
|
+
signing_key:
|
75
|
+
specification_version: 3
|
76
|
+
summary: tesseract ruby bindings
|
77
|
+
test_files: []
|
78
|
+
|