ocr_challenge 1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 53334886a5c1796fb3838f8eb11809185340b6a8
4
+ data.tar.gz: 94e10225727f7d9415f0f00695758d32d534a994
5
+ SHA512:
6
+ metadata.gz: 807ce07f9ea34cb158d63705570ff4bece9c96d833c241940947f7540a78a09fdb8fa1ec12fc00bea3dc96a5dd9437411aa7d1b3ce3ab6ae5adcb6d8dd44ddc7
7
+ data.tar.gz: 2c710c70ab22749f553827c8dd79bf29578532af3a2d0bad0a271db379b2778ac28cd81962c91e471d64876ae2bcaecb4b66bf61dfeff5b1de05d0b782602f69
data/.gitignore ADDED
@@ -0,0 +1,3 @@
1
+ .idea
2
+ coverage/
3
+ *.gem
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+ gemspec
3
+
data/Gemfile.lock ADDED
@@ -0,0 +1,36 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ ocr_challenge (1.0)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+ diff-lcs (1.2.5)
10
+ docile (1.1.5)
11
+ faker (1.4.3)
12
+ i18n (~> 0.5)
13
+ i18n (0.6.11)
14
+ multi_json (1.10.1)
15
+ rspec (2.99.0)
16
+ rspec-core (~> 2.99.0)
17
+ rspec-expectations (~> 2.99.0)
18
+ rspec-mocks (~> 2.99.0)
19
+ rspec-core (2.99.2)
20
+ rspec-expectations (2.99.2)
21
+ diff-lcs (>= 1.1.3, < 2.0)
22
+ rspec-mocks (2.99.2)
23
+ simplecov (0.9.1)
24
+ docile (~> 1.1.0)
25
+ multi_json (~> 1.0)
26
+ simplecov-html (~> 0.8.0)
27
+ simplecov-html (0.8.0)
28
+
29
+ PLATFORMS
30
+ ruby
31
+
32
+ DEPENDENCIES
33
+ faker (~> 1.4)
34
+ ocr_challenge!
35
+ rspec (~> 2.99)
36
+ simplecov (~> 0.9)
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2014 Alexander Vanadio
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
22
+
data/README.md ADDED
@@ -0,0 +1,81 @@
1
+ ## Business Card OCR Challenge - [defined here](http://www.asymmetrik.com/programming-challenges/business-card-ocr.html)
2
+
3
+ ### Overview
4
+ This gem allows you to take a newline-delimited String and pull out names, email addresses, and phone numbers. The String input can come from a file, STDIN, the output of another program, etc.
5
+
6
+ This concept has been specifically applied to business cards, but can easily be used more generically.
7
+
8
+ ### Installation
9
+
10
+ ```shell
11
+ gem install ocr_challenge
12
+ ```
13
+ or in your Gemfile
14
+
15
+ ```ruby
16
+ gem 'ocr_challenge'
17
+ ```
18
+
19
+ ### Usage
20
+
21
+ 1.) Create a contact with your input String
22
+
23
+ ```ruby
24
+ require 'ocr_challenge'
25
+ include OcrChallenge
26
+
27
+ text = """
28
+ Alexander Vanadio\n
29
+ execdd17@gmail.com\n
30
+ (123)-456-7890\n
31
+ """
32
+
33
+ contact = IBusinessCardParser.get_contact_info(text)
34
+ ```
35
+
36
+ 2.) Get the information through your contact instance
37
+
38
+ ```ruby
39
+ contact.get_name # => "Name: Alexander Vanadio"
40
+ contact.get_email_address # => "Email: execdd17@gmail.com"
41
+ contact.get_phone_number # => "Phone: 123-456-7890"
42
+
43
+ contact.to_s # => "Name: Alexander Vanadio\nEmail: execdd17@gmail.com\nPhone Number: 123-456-7890\n"
44
+ ```
45
+
46
+ ### Advanced Usage
47
+
48
+ Let's use a more complicated String input. The IBusinessCardParser will attempt to find all email addresses and phone numbers, but not fax numbers. Once it does, you can get them directly from your IContactInfo instance.
49
+
50
+ ```ruby
51
+ require 'ocr_challenge'
52
+ include OcrChallenge
53
+
54
+ text = """
55
+ Alexander Vanadio\n
56
+ Software Engineer
57
+ My Company Name
58
+ execdd17@gmail.com\n
59
+ anotherEmail@gmail.com\n
60
+ Phone: 1-(123)-456-7890\n
61
+ Cell: 123.444.7890\n
62
+ Fax: 892-234-5467
63
+ """
64
+
65
+ contact = IBusinessCardParser.get_contact_info(text)
66
+
67
+ contact.to_s # => "Name: Alexander Vanadio\nEmail: anotherEmail@gmail.com\nEmail: execdd17@gmail.com\nPhone Number: 123-444-7890\nPhone Number: 123-456-7890\n"
68
+
69
+ # you can also access the names, email_addresses, and phone_numbers directly
70
+ contact.names
71
+ contact.email_addresses
72
+ contact.phone_numbers
73
+ ```
74
+
75
+ ### Running Tests and Code Coverage
76
+
77
+ ```bash
78
+ cd ocr_challenge
79
+ rspec
80
+ firefox coverage/index.html
81
+ ```
@@ -0,0 +1,26 @@
1
+ # This module attempts to parse phone numbers, which I will define as
2
+ # BOTH land lines and cell phone numbers. It will attempt to filter out fax numbers
3
+ # NOTE: This assumes the including class will have a 'lines' instance variable
4
module OcrChallenge::BasicLandAndCellNumberParser
  include OcrChallenge::BasicTenDigitTelecomParser

  FAX_REGEX = /fax/i

  # Takes every line that holds a ten digit number, discards the ones that
  # mention "fax", and returns the rest formatted with '-' between the digit
  # groups, in sorted order.
  def parse_phone_numbers
    candidates = get_matches
    non_fax    = candidates.reject { |candidate| is_fax_number?(candidate) }

    non_fax.map { |candidate| format(candidate, '-') }.sort
  end

  private

  # Truthy (the match offset) when the line mentions "fax" in any letter case,
  # nil otherwise.
  def is_fax_number?(line)
    line =~ FAX_REGEX
  end
end
@@ -0,0 +1,27 @@
1
+ # This will find numbers that are grouped in 3 chunks of consecutive numbers,
2
+ # with chunk sizes of 3, 3, and 4 respectively, regardless of what is in
3
+ # between the chunks. It will also accept a leading 1.
4
+ # An example is 1-(234) 435-3567
5
module OcrChallenge
  # Mixin for locating and formatting ten digit phone numbers.
  # The including class must respond to `lines` with an Array of Strings.
  module BasicTenDigitTelecomParser

    BASIC_NUMBER_REGEX = /([1][^\d]*)?[\d]{3}[^\d]*[\d]{3}[^\d]*[\d]{4}/

    # Returns the entries of `lines` that contain a number matching
    # BASIC_NUMBER_REGEX. The lines are returned as-is, not formatted.
    def get_matches
      lines.select do |line|
        line =~ BASIC_NUMBER_REGEX
      end
    end

    private

    # Returns the digits of the phone number found in +line+, without the
    # leading 1 when one is present.
    #
    # Only the portion of the line matched by BASIC_NUMBER_REGEX is considered,
    # so unrelated digits elsewhere on the line (an extension, a label such as
    # "Cell 2") cannot shift the result. When nothing matches, every digit on
    # the line is used.
    #
    # The argument is never modified: callers pass in elements of `lines`,
    # which other parsers still need in their original form.
    def scrub_line(line)
      number_text = line[BASIC_NUMBER_REGEX] || line
      digits = number_text.gsub(/[^\d]/, '')
      digits.size == 11 ? digits[(1..-1)] : digits
    end

    # Returns the number found in +line+ as three digit groups (3, 3 and 4
    # digits) joined by +delimiter+, e.g. "123-456-7890".
    #
    # NOTE: inside including classes this takes precedence over Kernel#format.
    def format(line, delimiter)
      digits = scrub_line(line)
      digits[(0..2)] + delimiter + digits[(3..5)] + delimiter + digits[(6..9)]
    end
  end
end
@@ -0,0 +1,18 @@
1
module OcrChallenge
  # Mixin that extracts email addresses.
  # The including class must respond to `lines` with an Array of Strings.
  module EmailParser

    # adapted from: http://www.regular-expressions.info/email.html
    # The top level domain has no upper length limit so that addresses under
    # long TLDs (e.g. ".photography") are recognised as well.
    EMAIL_REGEX = /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b/i

    # Returns every email address found in `lines`, sorted.
    #
    # Only the text matched by EMAIL_REGEX is returned, so anything following
    # an address on the same line is left out, and a line holding several
    # addresses contributes all of them.
    def parse_email_addresses
      lines.flat_map { |line| line.scan(EMAIL_REGEX) }.sort
    end

  end
end
@@ -0,0 +1,18 @@
1
# Splits a business card document into lines and exposes the phone, email and
# name parsers over them.
class OcrChallenge::IBusinessCardParser
  include OcrChallenge::BasicLandAndCellNumberParser
  include OcrChallenge::EmailParser
  include OcrChallenge::NameParser

  # Parses +document+ (a newline-delimited String) and returns the resulting
  # OcrChallenge::IContactInfo.
  def self.get_contact_info(document)
    parser = new(document)
    # Fully qualified: with the compact class header above, a bare
    # `IContactInfo` is only found when the caller has included OcrChallenge
    # at the top level.
    OcrChallenge::IContactInfo.new(parser)
  end

  # Stores the non-blank lines of +document+. Both "\n" and "\r\n" line
  # endings are accepted, and whitespace-only lines are dropped.
  def initialize(document)
    @lines = document.split(/\r?\n/).reject { |line| line.strip.empty? }
  end

  private

  # Read by the included parser modules.
  attr_reader :lines
end
@@ -0,0 +1,45 @@
1
# Holds the names, email addresses and phone numbers a parser extracted.
class OcrChallenge::IContactInfo

  attr_reader :names, :email_addresses, :phone_numbers

  # Runs the three parse steps on +parser+ right away (names first, then email
  # addresses, then phone numbers) and keeps the results.
  # +list_of_names_dir+ is handed to the parser's parse_names.
  def initialize(parser, list_of_names_dir='names')
    @parser          = parser
    @names           = parser.parse_names(list_of_names_dir)
    @email_addresses = parser.parse_email_addresses
    @phone_numbers   = parser.parse_phone_numbers
  end

  # NOTE: a card may carry more than one name (e.g. several points of contact).
  # The challenge did not specify what to do then, so the first one is used.
  def get_name
    'Name: ' + names.first.to_s
  end

  # NOTE: the challenge does not account for multiple email addresses, so the
  # first one is used.
  def get_email_address
    'Email: ' + email_addresses.first.to_s
  end

  # NOTE: the challenge does not account for multiple phone numbers, so the
  # first one is used.
  def get_phone_number
    'Phone: ' + phone_numbers.first.to_s
  end

  # One "<label>: <value>\n" line per stored value: all names, then all email
  # addresses, then all phone numbers.
  def to_s
    text = ''
    sections = [
      ['Name', names],
      ['Email', email_addresses],
      ['Phone Number', phone_numbers]
    ]

    sections.each do |label, values|
      values.each { |value| text << "#{label}: #{value}\n" }
    end

    text
  end

end
@@ -0,0 +1,28 @@
1
+ require 'pathname'
2
+
3
+ # It turns out that identifying names in a blob of text is hard. I decided to
4
+ # use a dictionary of names in combination with eliminating lines with digits.
5
+ module OcrChallenge::NameParser
6
+
7
+ # Note: the name files are expected to be new line separated names
8
+ def parse_names(dir_path)
9
+
10
+ #TODO: catch IO exception
11
+ names_dir = Pathname.new(dir_path)
12
+ name_files= names_dir.children
13
+
14
+ preprocessed_lines = lines.map(&:strip).reject do |line|
15
+ line =~ /\d/ # names shouldn't have digits in them
16
+ end
17
+
18
+ # compare the current line with all the names available in the name files
19
+ preprocessed_lines.select do |line|
20
+ name_files.any? do |file|
21
+ name_lines = file.readlines
22
+ name_lines.any? do |name_line|
23
+ line.downcase =~ /\b#{name_line.downcase.chomp}\b/
24
+ end
25
+ end
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,3 @@
1
module OcrChallenge
  # Gem version. Kept as a String: a Float cannot tell "1.1" from "1.10" and
  # cannot hold a third segment such as "1.0.1".
  VERSION = '1.0'.freeze
end
@@ -0,0 +1,12 @@
1
+ $:.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
2
+
3
+ # the project namespace
4
+ module OcrChallenge; end
5
+
6
+ require 'ocr_challenge/basic_ten_digit_telecom_parser'
7
+ require 'ocr_challenge/basic_land_and_cell_number_parser'
8
+ require 'ocr_challenge/name_parser'
9
+ require 'ocr_challenge/email_parser'
10
+ require 'ocr_challenge/i_contact_info'
11
+ require 'ocr_challenge/i_business_card_parser'
12
+ require 'ocr_challenge/version'