xtractor 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6d3cd5007bf97796771406720cbb0ee70dfb202c
4
- data.tar.gz: 04c5b121c7ec50cadec23798c81343922e9389e1
3
+ metadata.gz: e360ba9ddac0bd8b589bbfa0cdda7c344408b029
4
+ data.tar.gz: 9ffe93ad99b50e40d8fc018b71dc2cdb78bd4273
5
5
  SHA512:
6
- metadata.gz: d3573e93e365cbcf8beb8d0e50bf3a57c40ab881f366b5851e4b1ff900424ce41df30522eaf4f03abd28a8eae5e222a24ec4a41203cd5cdd2dffe90b587bd532
7
- data.tar.gz: 41e3abf3c28d2bd54acae446176f84a43ba17040bacba6e9ebc646a9bd0840a293d7d2c6ec1662e041a0802a22103e1a1afe88af0b670cb1c3145c24381942f8
6
+ metadata.gz: ec2936bc8117b54e570d30cbb871555d0853da8e5e5a27778e84274df085d03c5c079293ed30e21de86f8c5e71367bde511d20cf148c9f7ce714ff376d84175f
7
+ data.tar.gz: 3f917c1372895e6a3db1cd6ecfea2a56afe141299c07a2a362a89f7036d6346a65c83b243275be4062f1779e1f5ec11e4efcd7d33d6cfc4b1b5d9b7aaaa5b6ec
data/.travis.yml CHANGED
@@ -1,4 +1,4 @@
1
- sudo: false
1
+ sudo: false
2
2
  language: ruby
3
3
  rvm:
4
4
  - 2.3.1
data/README.md CHANGED
@@ -1,6 +1,8 @@
1
1
  # Xtractor
2
+ <img src="https://badge.fury.io/rb/xtractor.svg" alt="Gem Version" /> <img src="https://travis-ci.org/Kamalpaneru/Xtractor.svg?branch=master" alt="Build" />
2
3
 
3
- Xtractor was developed as a need to my own problem of inserting handwritten data from an excelsheet image to excel.And it does the same as described.
4
+
5
+ Xtractor was developed to solve the problem of inserting data from an image of an Excel sheet into Excel, and it does exactly that.
4
6
 
5
7
  ## Installation
6
8
 
@@ -20,7 +22,22 @@ Or install it yourself as:
20
22
 
21
23
  ## Usage
22
24
 
23
- Used to split cells from excel sheet images and extracts data.
25
+ Splits the cells of an Excel sheet image and extracts the data from each cell. <br>
26
+ NOTE: I've replaced Tesseract with the Azure Computer Vision API (not perfect, but a significant improvement).<br> <br>
27
+
28
+ ```ruby
29
+ require 'xtractor'
30
+
31
+ Xtractor::Execute.new('Image_Filename')
32
+
33
+ ```
34
+ ## Sample Image
35
+
36
+ ![image_f3](https://user-images.githubusercontent.com/13826932/31273813-03dde45a-aab0-11e7-942f-c77202f996d1.jpg)
37
+
38
+ ## Generate API key
39
+ Replace the subscription key (the `Ocp-Apim-Subscription-Key` value) in ```lib/xtractor/request.rb``` with your own key.<br>
40
+ You can generate a key here: ```https://azure.microsoft.com/en-gb/try/cognitive-services/```
24
41
 
25
42
  ## Contributing
26
43
 
data/lib/xtractor.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  require "xtractor/version"
2
2
  require "xtractor/xtract"
3
+ require_relative "xtractor/request"
3
4
 
4
5
 
@@ -0,0 +1,46 @@
1
+ require 'net/http'
2
+ require 'json'
3
+
4
+ class Azure_API
5
+
6
+ def request_API
7
+ uri = URI('https://westcentralus.api.cognitive.microsoft.com/vision/v1.0/ocr')
8
+ uri.query = URI.encode_www_form({
9
+
10
+ 'language' => 'en',
11
+ 'detectOrientation ' => 'true'
12
+ })
13
+
14
+ request = Net::HTTP::Post.new(uri.request_uri)
15
+
16
+ request['Ocp-Apim-Subscription-Key'] = "d0154961a66b4a0aa41d6f4fbb4b2105"
17
+
18
+ request['Content-Type'] = 'application/octet-stream'
19
+
20
+ collect = Hash.new
21
+
22
+ all_files = Dir.glob("cell-files/*.{jpg,png,gif}").sort
23
+
24
+ all_files.each do |crop_image|
25
+ request.body = File.binread("#{crop_image}")
26
+
27
+ response = Net::HTTP.start(uri.host, uri.port, :use_ssl => uri.scheme == 'https') do |http|
28
+ http.request(request)
29
+ end
30
+
31
+ collect[crop_image[0..-5]]= JSON.parse(response.body)
32
+
33
+ initial_res = collect[crop_image[0..-5]].dig("regions",0,"lines",0,"words",0,"text").to_s
34
+ mid_res = collect[crop_image[0..-5]].dig("regions",0,"lines",0,"words",1,"text").to_s
35
+ final_res = collect[crop_image[0..-5]].dig("regions",0,"lines",0,"words",2,"text").to_s
36
+
37
+ collect[crop_image[0..-5]] = initial_res + ' ' + mid_res + ' ' + final_res
38
+
39
+ puts collect[crop_image[0..-5]]
40
+
41
+ File.open("cell-files/#{crop_image[11..-5]}.txt", "w") do |data|
42
+ data.write(collect[crop_image[0..-5]])
43
+ end
44
+ end
45
+ end
46
+ end
@@ -1,3 +1,3 @@
1
1
  module Xtractor
2
- VERSION = "0.0.1"
2
+ VERSION = "0.1.0"
3
3
  end
@@ -1,11 +1,13 @@
1
1
  require "rubygems"
2
2
  require "rmagick"
3
+ require_relative "request"
3
4
 
4
5
  module Xtractor
5
6
  class Execute
6
7
 
7
8
  def initialize(image)
8
9
  img = Magick::Image::read(image).first
10
+
9
11
  if %w(TIFF).include? img.format
10
12
  crop_throw(img)
11
13
  else
@@ -16,76 +18,97 @@ module Xtractor
16
18
  end
17
19
 
18
20
  def crop_throw(img)
19
- img = img.resize_to_fit(2500,906)
20
- box = img.bounding_box
21
- img.crop!(box.x, box.y, box.width, box.height)
21
+ img = img.resize_to_fit(2500,906)
22
+ box = img.bounding_box
23
+ img.crop!(box.x, box.y, box.width, box.height)
22
24
  start(img)
23
25
  end
24
26
 
25
- def start(img)
26
- store_line_rows = (0..img.rows-1).inject([]) do |arr, line_index|
27
- threshold = (img.columns*0.10).floor
28
- arr << line_index if img.get_pixels(0, line_index, (threshold), 1).select{|pixel|
29
- pixel.red < 63000 }.length >= threshold*0.95
30
- arr
27
+ def store_line_rows(img)
28
+ (0...img.rows).inject([]) do |arr, line_index|
29
+ threshold = (img.columns*0.10).floor
30
+ arr << line_index if img.get_pixels(0, line_index, (threshold), 1).select{|pixel|
31
+ pixel.red < 63000 }.length >= threshold*0.95
32
+ arr
31
33
  end
34
+ end
32
35
 
33
-
34
- store_line_columns = (0..img.columns-1).inject([])do |arr, line_index|
35
- threshold = (img.rows*0.10).floor
36
- arr << line_index if img.get_pixels(line_index, 0, 1, (threshold)).select{|pixel|
37
- pixel.red < 63000 }.length >= threshold*0.95
38
- arr
39
- end
40
-
41
-
42
-
43
- columns_filter = store_line_columns[1..-1].inject( [[ (store_line_columns[0]),(store_line_columns[0]) ]]) do |arr, line|
44
- if line == arr.last[1]+1
45
- arr.last[1] = line
46
- else
47
- arr << [line,line]
48
- end
49
- arr
36
+ def store_line_columns(img)
37
+ (0...img.columns).inject([])do |arr, line_index|
38
+ threshold = (img.rows*0.10).floor
39
+ arr << line_index if img.get_pixels(line_index, 0, 1, (threshold)).select{|pixel|
40
+ pixel.red < 63000 }.length >= threshold*0.95
41
+ arr
50
42
  end
43
+ end
51
44
 
52
-
53
- rows_filter = store_line_rows[1..-1].inject( [[ (store_line_rows[0]), (store_line_rows[0] )]]) do |arr, line|
54
- if line == arr.last[1]+1
55
- arr.last[1] = line
56
- else
57
- arr << [line,line]
45
+ def columns_filter(img)
46
+ store_line_columns(img)[1..-1].inject( [[ (store_line_columns(img)[0]),(store_line_columns(img)[0]) ]]) do |arr, line|
47
+ if line == arr.last[1]+1
48
+ arr.last[1] = line
49
+ else
50
+ arr << [line,line]
51
+ end
52
+ arr
58
53
  end
59
- arr
54
+ end
55
+
56
+ def rows_filter(img)
57
+ store_line_rows(img)[1..-1].inject( [[ (store_line_rows(img)[0]), (store_line_rows(img)[0] )]]) do |arr, line|
58
+ if line == arr.last[1]+1
59
+ arr.last[1] = line
60
+ else
61
+ arr << [line,line]
62
+ end
63
+ arr
60
64
  end
65
+ end
61
66
 
62
67
 
63
- Dir.mkdir('cell-files') if !File.exists?('cell-files')
64
68
 
65
- output_file = File.open('table.txt', 'w')
69
+ def start(img)
70
+ Dir.mkdir('cell-files') if !File.exist?('cell-files')
66
71
 
67
- rows_filter[0..-2].each_with_index do |row, i|
68
- text_row = []
69
- columns_filter[0..-2].each_with_index do |column, j|
70
- x,y= column[1], row[1]
71
- w,h= columns_filter[j+1][0]-x, rows_filter[i+1][0]-y
72
+ rows_filter(img)[0..-2].each_with_index do |row, i|
73
+ columns_filter(img)[0..-2].each_with_index do |column, j|
74
+ x,y= column[1], row[1]
75
+ w,h= columns_filter(img)[j+1][0]-x, rows_filter(img)[i+1][0]-y
72
76
 
73
- Magick::Image.constitute(w, h, "RGB", img.get_pixels(x,y,w,h).map{ |pixel|
74
- [pixel.red, pixel.green, pixel.blue]}.flatten).write("cell-files/#{j}x#{i}.tif") do |out|
77
+ Magick::Image.constitute(w, h, "RGB", img.get_pixels(x,y,w,h).map{ |pixel|
78
+ [pixel.red, pixel.green, pixel.blue]}.flatten).write("cell-files/#{j}x#{i}.jpg") do |out|
75
79
  out.depth=8
76
80
  end
77
81
 
78
- `tesseract cell-files/#{j}x#{i}.tif cell-files/#{j}x#{i} `
79
-
82
+ r_image = Magick::Image::read("cell-files/#{j}x#{i}.jpg").first
83
+ res_image = r_image.resize(r_image.columns,100)
80
84
 
81
- text_row << File.open("cell-files/#{j}x#{i}.txt", 'r').readlines.map{|line| line.strip}.join(" ")
85
+ res_image.write("cell-files/#{j}x#{i}.jpg") do
86
+ self.quality = 100
87
+ end
82
88
 
89
+ end
83
90
  end
91
+ collect_hash(img)
92
+ end
93
+
94
+ def collect_hash(img)
95
+ api = Azure_API.new
96
+ api.request_API
97
+ out_final(img)
98
+ end
84
99
 
85
- output_file.puts( text_row.join("\t"))
100
+ def out_final(img)
101
+ output_file = File.open('table.tsv', 'w')
102
+ rows_filter(img)[0..-2].each_with_index do |_row, i|
103
+ text_row = []
104
+ columns_filter(img)[0..-2].each_with_index do |_column, j|
105
+ text_row << File.open("cell-files/#{j}x#{i}.txt", 'r').readlines.map{|line| line.strip}.join(" ")
106
+ end
107
+ output_file.puts( text_row.join("\t"))
86
108
  end
87
109
  output_file.close
88
110
  end
89
111
 
90
112
  end
91
113
  end
114
+
data/xtractor.gemspec CHANGED
@@ -11,7 +11,7 @@ Gem::Specification.new do |spec|
11
11
  spec.files = ["lib/xtract.rb"]
12
12
 
13
13
  spec.summary = %q{Splits cells in an excelsheet images and extracts data.}
14
- spec.description = %q{Xtractor was developed as a need to my own problem of inserting handwritten data from an excelsheet printed paper to excel.And it does the same as described. }
14
+ spec.description = %q{Xtractor was developed as a need to my own problem of inserting handwritten data from an excelsheet image to excel.And it does the same as described. }
15
15
  spec.homepage = 'https://rubygems.org/gems/xtractor'
16
16
  spec.license = "MIT"
17
17
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: xtractor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - kamalpaneru
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-06-10 00:00:00.000000000 Z
11
+ date: 2017-10-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -67,7 +67,7 @@ dependencies:
67
67
  - !ruby/object:Gem::Version
68
68
  version: 2.16.0
69
69
  description: 'Xtractor was developed as a need to my own problem of inserting handwritten
70
- data from an excelsheet printed paper to excel.And it does the same as described. '
70
+ data from an excelsheet image to excel.And it does the same as described. '
71
71
  email:
72
72
  - kamalpaneru.15@gmail.com
73
73
  executables: []
@@ -85,6 +85,7 @@ files:
85
85
  - bin/console
86
86
  - bin/setup
87
87
  - lib/xtractor.rb
88
+ - lib/xtractor/request.rb
88
89
  - lib/xtractor/version.rb
89
90
  - lib/xtractor/xtract.rb
90
91
  - xtractor.gemspec
@@ -109,7 +110,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
109
110
  version: '0'
110
111
  requirements: []
111
112
  rubyforge_project:
112
- rubygems_version: 2.5.1
113
+ rubygems_version: 2.6.14
113
114
  signing_key:
114
115
  specification_version: 4
115
116
  summary: Splits cells in an excelsheet images and extracts data.