xtractor 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +1 -1
- data/README.md +19 -2
- data/lib/xtractor.rb +1 -0
- data/lib/xtractor/request.rb +46 -0
- data/lib/xtractor/version.rb +1 -1
- data/lib/xtractor/xtract.rb +69 -46
- data/xtractor.gemspec +1 -1
- metadata +5 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e360ba9ddac0bd8b589bbfa0cdda7c344408b029
|
4
|
+
data.tar.gz: 9ffe93ad99b50e40d8fc018b71dc2cdb78bd4273
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ec2936bc8117b54e570d30cbb871555d0853da8e5e5a27778e84274df085d03c5c079293ed30e21de86f8c5e71367bde511d20cf148c9f7ce714ff376d84175f
|
7
|
+
data.tar.gz: 3f917c1372895e6a3db1cd6ecfea2a56afe141299c07a2a362a89f7036d6346a65c83b243275be4062f1779e1f5ec11e4efcd7d33d6cfc4b1b5d9b7aaaa5b6ec
|
data/.travis.yml
CHANGED
data/README.md
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
# Xtractor
|
2
|
+
<img src="https://badge.fury.io/rb/xtractor.svg" alt="Gem Version" /> <img src="https://travis-ci.org/Kamalpaneru/Xtractor.svg?branch=master" alt="Build" />
|
2
3
|
|
3
|
-
|
4
|
+
|
5
|
+
Xtractor was developed as a need to the problem of inserting data from an excelsheet image to excel.And it does the same as described.
|
4
6
|
|
5
7
|
## Installation
|
6
8
|
|
@@ -20,7 +22,22 @@ Or install it yourself as:
|
|
20
22
|
|
21
23
|
## Usage
|
22
24
|
|
23
|
-
Used to split cells from excel sheet images and extracts data.
|
25
|
+
Used to split cells from excel sheet images and extracts data. <br>
|
26
|
+
NOTE: I've replaced Tesseract with Azure Computer Vision API(Not perfect but a significant improvement though).<br> <br>
|
27
|
+
|
28
|
+
```ruby
|
29
|
+
require 'xtractor'
|
30
|
+
|
31
|
+
Xtractor::Execute.new('Image_Filename')
|
32
|
+
|
33
|
+
```
|
34
|
+
## Sample Image
|
35
|
+
|
36
|
+

|
37
|
+
|
38
|
+
## Generate API key
|
39
|
+
Replace API_KEY in ```lib/xtractor/request.rb ``` with your Key.<br>
|
40
|
+
```https://azure.microsoft.com/en-gb/try/cognitive-services/ ```
|
24
41
|
|
25
42
|
## Contributing
|
26
43
|
|
data/lib/xtractor.rb
CHANGED
data/lib/xtractor/request.rb
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
require 'net/http'
|
2
|
+
require 'json'
|
3
|
+
|
4
|
+
class Azure_API
|
5
|
+
|
6
|
+
def request_API
|
7
|
+
uri = URI('https://westcentralus.api.cognitive.microsoft.com/vision/v1.0/ocr')
|
8
|
+
uri.query = URI.encode_www_form({
|
9
|
+
|
10
|
+
'language' => 'en',
|
11
|
+
'detectOrientation ' => 'true'
|
12
|
+
})
|
13
|
+
|
14
|
+
request = Net::HTTP::Post.new(uri.request_uri)
|
15
|
+
|
16
|
+
request['Ocp-Apim-Subscription-Key'] = "d0154961a66b4a0aa41d6f4fbb4b2105"
|
17
|
+
|
18
|
+
request['Content-Type'] = 'application/octet-stream'
|
19
|
+
|
20
|
+
collect = Hash.new
|
21
|
+
|
22
|
+
all_files = Dir.glob("cell-files/*.{jpg,png,gif}").sort
|
23
|
+
|
24
|
+
all_files.each do |crop_image|
|
25
|
+
request.body = File.binread("#{crop_image}")
|
26
|
+
|
27
|
+
response = Net::HTTP.start(uri.host, uri.port, :use_ssl => uri.scheme == 'https') do |http|
|
28
|
+
http.request(request)
|
29
|
+
end
|
30
|
+
|
31
|
+
collect[crop_image[0..-5]]= JSON.parse(response.body)
|
32
|
+
|
33
|
+
initial_res = collect[crop_image[0..-5]].dig("regions",0,"lines",0,"words",0,"text").to_s
|
34
|
+
mid_res = collect[crop_image[0..-5]].dig("regions",0,"lines",0,"words",1,"text").to_s
|
35
|
+
final_res = collect[crop_image[0..-5]].dig("regions",0,"lines",0,"words",2,"text").to_s
|
36
|
+
|
37
|
+
collect[crop_image[0..-5]] = initial_res + ' ' + mid_res + ' ' + final_res
|
38
|
+
|
39
|
+
puts collect[crop_image[0..-5]]
|
40
|
+
|
41
|
+
File.open("cell-files/#{crop_image[11..-5]}.txt", "w") do |data|
|
42
|
+
data.write(collect[crop_image[0..-5]])
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
data/lib/xtractor/version.rb
CHANGED
data/lib/xtractor/xtract.rb
CHANGED
@@ -1,11 +1,13 @@
|
|
1
1
|
require "rubygems"
|
2
2
|
require "rmagick"
|
3
|
+
require_relative "request"
|
3
4
|
|
4
5
|
module Xtractor
|
5
6
|
class Execute
|
6
7
|
|
7
8
|
def initialize(image)
|
8
9
|
img = Magick::Image::read(image).first
|
10
|
+
|
9
11
|
if %w(TIFF).include? img.format
|
10
12
|
crop_throw(img)
|
11
13
|
else
|
@@ -16,76 +18,97 @@ module Xtractor
|
|
16
18
|
end
|
17
19
|
|
18
20
|
def crop_throw(img)
|
19
|
-
|
20
|
-
|
21
|
-
|
21
|
+
img = img.resize_to_fit(2500,906)
|
22
|
+
box = img.bounding_box
|
23
|
+
img.crop!(box.x, box.y, box.width, box.height)
|
22
24
|
start(img)
|
23
25
|
end
|
24
26
|
|
25
|
-
def
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
27
|
+
def store_line_rows(img)
|
28
|
+
(0...img.rows).inject([]) do |arr, line_index|
|
29
|
+
threshold = (img.columns*0.10).floor
|
30
|
+
arr << line_index if img.get_pixels(0, line_index, (threshold), 1).select{|pixel|
|
31
|
+
pixel.red < 63000 }.length >= threshold*0.95
|
32
|
+
arr
|
31
33
|
end
|
34
|
+
end
|
32
35
|
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
end
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
columns_filter = store_line_columns[1..-1].inject( [[ (store_line_columns[0]),(store_line_columns[0]) ]]) do |arr, line|
|
44
|
-
if line == arr.last[1]+1
|
45
|
-
arr.last[1] = line
|
46
|
-
else
|
47
|
-
arr << [line,line]
|
48
|
-
end
|
49
|
-
arr
|
36
|
+
def store_line_columns(img)
|
37
|
+
(0...img.columns).inject([])do |arr, line_index|
|
38
|
+
threshold = (img.rows*0.10).floor
|
39
|
+
arr << line_index if img.get_pixels(line_index, 0, 1, (threshold)).select{|pixel|
|
40
|
+
pixel.red < 63000 }.length >= threshold*0.95
|
41
|
+
arr
|
50
42
|
end
|
43
|
+
end
|
51
44
|
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
45
|
+
def columns_filter(img)
|
46
|
+
store_line_columns(img)[1..-1].inject( [[ (store_line_columns(img)[0]),(store_line_columns(img)[0]) ]]) do |arr, line|
|
47
|
+
if line == arr.last[1]+1
|
48
|
+
arr.last[1] = line
|
49
|
+
else
|
50
|
+
arr << [line,line]
|
51
|
+
end
|
52
|
+
arr
|
58
53
|
end
|
59
|
-
|
54
|
+
end
|
55
|
+
|
56
|
+
def rows_filter(img)
|
57
|
+
store_line_rows(img)[1..-1].inject( [[ (store_line_rows(img)[0]), (store_line_rows(img)[0] )]]) do |arr, line|
|
58
|
+
if line == arr.last[1]+1
|
59
|
+
arr.last[1] = line
|
60
|
+
else
|
61
|
+
arr << [line,line]
|
62
|
+
end
|
63
|
+
arr
|
60
64
|
end
|
65
|
+
end
|
61
66
|
|
62
67
|
|
63
|
-
Dir.mkdir('cell-files') if !File.exists?('cell-files')
|
64
68
|
|
65
|
-
|
69
|
+
def start(img)
|
70
|
+
Dir.mkdir('cell-files') if !File.exist?('cell-files')
|
66
71
|
|
67
|
-
rows_filter[0..-2].each_with_index do |row, i|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
w,h= columns_filter[j+1][0]-x, rows_filter[i+1][0]-y
|
72
|
+
rows_filter(img)[0..-2].each_with_index do |row, i|
|
73
|
+
columns_filter(img)[0..-2].each_with_index do |column, j|
|
74
|
+
x,y= column[1], row[1]
|
75
|
+
w,h= columns_filter(img)[j+1][0]-x, rows_filter(img)[i+1][0]-y
|
72
76
|
|
73
|
-
|
74
|
-
[pixel.red, pixel.green, pixel.blue]}.flatten).write("cell-files/#{j}x#{i}.
|
77
|
+
Magick::Image.constitute(w, h, "RGB", img.get_pixels(x,y,w,h).map{ |pixel|
|
78
|
+
[pixel.red, pixel.green, pixel.blue]}.flatten).write("cell-files/#{j}x#{i}.jpg") do |out|
|
75
79
|
out.depth=8
|
76
80
|
end
|
77
81
|
|
78
|
-
|
79
|
-
|
82
|
+
r_image = Magick::Image::read("cell-files/#{j}x#{i}.jpg").first
|
83
|
+
res_image = r_image.resize(r_image.columns,100)
|
80
84
|
|
81
|
-
|
85
|
+
res_image.write("cell-files/#{j}x#{i}.jpg") do
|
86
|
+
self.quality = 100
|
87
|
+
end
|
82
88
|
|
89
|
+
end
|
83
90
|
end
|
91
|
+
collect_hash(img)
|
92
|
+
end
|
93
|
+
|
94
|
+
def collect_hash(img)
|
95
|
+
api = Azure_API.new
|
96
|
+
api.request_API
|
97
|
+
out_final(img)
|
98
|
+
end
|
84
99
|
|
85
|
-
|
100
|
+
def out_final(img)
|
101
|
+
output_file = File.open('table.tsv', 'w')
|
102
|
+
rows_filter(img)[0..-2].each_with_index do |_row, i|
|
103
|
+
text_row = []
|
104
|
+
columns_filter(img)[0..-2].each_with_index do |_column, j|
|
105
|
+
text_row << File.open("cell-files/#{j}x#{i}.txt", 'r').readlines.map{|line| line.strip}.join(" ")
|
106
|
+
end
|
107
|
+
output_file.puts( text_row.join("\t"))
|
86
108
|
end
|
87
109
|
output_file.close
|
88
110
|
end
|
89
111
|
|
90
112
|
end
|
91
113
|
end
|
114
|
+
|
data/xtractor.gemspec
CHANGED
@@ -11,7 +11,7 @@ Gem::Specification.new do |spec|
|
|
11
11
|
spec.files = ["lib/xtract.rb"]
|
12
12
|
|
13
13
|
spec.summary = %q{Splits cells in an excelsheet images and extracts data.}
|
14
|
-
spec.description = %q{Xtractor was developed as a need to my own problem of inserting handwritten data from an excelsheet
|
14
|
+
spec.description = %q{Xtractor was developed as a need to my own problem of inserting handwritten data from an excelsheet image to excel.And it does the same as described. }
|
15
15
|
spec.homepage = 'https://rubygems.org/gems/xtractor'
|
16
16
|
spec.license = "MIT"
|
17
17
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: xtractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- kamalpaneru
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-10-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -67,7 +67,7 @@ dependencies:
|
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: 2.16.0
|
69
69
|
description: 'Xtractor was developed as a need to my own problem of inserting handwritten
|
70
|
-
data from an excelsheet
|
70
|
+
data from an excelsheet image to excel.And it does the same as described. '
|
71
71
|
email:
|
72
72
|
- kamalpaneru.15@gmail.com
|
73
73
|
executables: []
|
@@ -85,6 +85,7 @@ files:
|
|
85
85
|
- bin/console
|
86
86
|
- bin/setup
|
87
87
|
- lib/xtractor.rb
|
88
|
+
- lib/xtractor/request.rb
|
88
89
|
- lib/xtractor/version.rb
|
89
90
|
- lib/xtractor/xtract.rb
|
90
91
|
- xtractor.gemspec
|
@@ -109,7 +110,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
109
110
|
version: '0'
|
110
111
|
requirements: []
|
111
112
|
rubyforge_project:
|
112
|
-
rubygems_version: 2.
|
113
|
+
rubygems_version: 2.6.14
|
113
114
|
signing_key:
|
114
115
|
specification_version: 4
|
115
116
|
summary: Splits cells in an excelsheet images and extracts data.
|