consistent_company 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/Gemfile +15 -0
- data/Gemfile.lock +36 -0
- data/LICENSE.txt +20 -0
- data/README.md +97 -0
- data/Rakefile +48 -0
- data/VERSION +1 -0
- data/consistent_company.gemspec +70 -0
- data/ext/consistent_company/consistent_company.c +383 -0
- data/ext/consistent_company/extconf.rb +6 -0
- data/lib/consistent_company/consistent_company.bundle +0 -0
- data/lib/consistent_company/version.rb +10 -0
- data/lib/consistent_company.rb +5 -0
- data/test/helper.rb +19 -0
- data/test/test_consistent_company.rb +73 -0
- metadata +133 -0
data/.document
ADDED
data/Gemfile
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
source "http://rubygems.org"
|
2
|
+
# Add dependencies required to use your gem here.
|
3
|
+
# Example:
|
4
|
+
# gem "activesupport", ">= 2.3.5"
|
5
|
+
|
6
|
+
# Add dependencies to develop your gem here.
|
7
|
+
# Include everything needed to run rake, tests, features, etc.
|
8
|
+
group :development do
|
9
|
+
gem "shoulda", ">= 0"
|
10
|
+
gem "bundler", "~> 1.0.18"
|
11
|
+
gem "jeweler", "~> 1.6.4"
|
12
|
+
gem "rcov", ">= 0"
|
13
|
+
gem 'pry'
|
14
|
+
gem "rake-compiler", "~> 0.7.6"
|
15
|
+
end
|
data/Gemfile.lock
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
GEM
|
2
|
+
remote: http://rubygems.org/
|
3
|
+
specs:
|
4
|
+
coderay (0.9.8)
|
5
|
+
git (1.2.5)
|
6
|
+
jeweler (1.6.4)
|
7
|
+
bundler (~> 1.0)
|
8
|
+
git (>= 1.2.5)
|
9
|
+
rake
|
10
|
+
method_source (0.6.5)
|
11
|
+
ruby_parser (>= 2.0.5)
|
12
|
+
pry (0.9.5)
|
13
|
+
coderay (>= 0.9.8)
|
14
|
+
method_source (>= 0.6.5)
|
15
|
+
ruby_parser (>= 2.0.5)
|
16
|
+
slop (~> 2.1.0)
|
17
|
+
rake (0.9.2)
|
18
|
+
rake-compiler (0.7.6)
|
19
|
+
rake
|
20
|
+
rcov (0.9.10)
|
21
|
+
ruby_parser (2.3.0)
|
22
|
+
sexp_processor (~> 3.0)
|
23
|
+
sexp_processor (3.0.6)
|
24
|
+
shoulda (2.11.3)
|
25
|
+
slop (2.1.0)
|
26
|
+
|
27
|
+
PLATFORMS
|
28
|
+
ruby
|
29
|
+
|
30
|
+
DEPENDENCIES
|
31
|
+
bundler (~> 1.0.18)
|
32
|
+
jeweler (~> 1.6.4)
|
33
|
+
pry
|
34
|
+
rake-compiler (~> 0.7.6)
|
35
|
+
rcov
|
36
|
+
shoulda
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2011 Doug Cleven
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,97 @@
|
|
1
|
+
# consistent_company
|
2
|
+
|
3
|
+
A Ruby C Extension that takes a Company Name string and replaces it with a normalized compressed version.
|
4
|
+
The new name is not meant for human consumption, but for comparing other like names to find matches.
|
5
|
+
This was written with the advertising domain in mind, so many of the common advertiser/agency abbreviations are
|
6
|
+
handled well. Although it can work with other domains, you may find you need to enhance the specific company name
|
7
|
+
identifiers in IsCompanyWord and TransformCompany.
|
8
|
+
|
9
|
+
## Install:
|
10
|
+
gem install consistent_company
|
11
|
+
|
12
|
+
## Usage:
|
13
|
+
A company_namer method is added to the string class.
|
14
|
+
|
15
|
+
>> require 'consistent_company'
|
16
|
+
=> true
|
17
|
+
>> 'First Federal Savings'.company_namer
|
18
|
+
=> '1STFEDERALSAVINGS'
|
19
|
+
|
20
|
+
## Development:
|
21
|
+
|
22
|
+
|
23
|
+
## CompanyNamer Function Transformations
|
24
|
+
This function performs a sequence of changes to its input parameter, which should be the name of an advertiser or agency.
|
25
|
+
|
26
|
+
1. It transforms all alpha characters to upper case.
|
27
|
+
|
28
|
+
2. It loops through the string, finding and tracking open and close parenthesis characters.
|
29
|
+
If it finds any of the following patterns, the parentheses and the text contained within them is removed
|
30
|
+
and replaced with a single space.
|
31
|
+
The “..” strings indicate other characters not within the parentheses; the “xx” characters represent any characters within the parentheses.
|
32
|
+
All other patterns of parenthesis characters are left intact.
|
33
|
+
..(xx).. ..(xx ..(xx)..(xx).. ..(xx(xx)xx).. ..(xx)..(xx ..(xx(xx)xx ..(xx(xx
|
34
|
+
|
35
|
+
3. It loops through the string performing character-level transformations:
|
36
|
+
|
37
|
+
a. Characters in the ranges A-Z, 0-9, and high-order characters (ASCII value 128 or greater) are kept unchanged.
|
38
|
+
|
39
|
+
b. Single tic-marks (‘) are removed
|
40
|
+
|
41
|
+
c. An ampersand character (&) in the middle of the name is converted to “AND”, with surrounding spaces as they were
|
42
|
+
|
43
|
+
d. A plus sign (+) is converted to “PLUS” if it is preceded only by “A” (taking account of spacing); otherwise it is converted to “AND”
|
44
|
+
|
45
|
+
e. All other characters are converted to spaces
|
46
|
+
|
47
|
+
f. All the above transformations ensure that there are no multiple spaces in the result
|
48
|
+
|
49
|
+
4. It replaces all instances of “ AND ” (the word “and” surrounded by spaces) with “ & ”, and removes leading and trailing spaces.
|
50
|
+
|
51
|
+
5. It passes the string through TransformCompany and then returns the result.
|
52
|
+
|
53
|
+
## TransformCompany Function Transformations
|
54
|
+
This function transforms elements of the company name at the word level,
|
55
|
+
removing some and making variants into a common form.
|
56
|
+
It also changes commonly used long words into shorter versions, to speed the hashing process.
|
57
|
+
|
58
|
+
1. It removes the word “THE”
|
59
|
+
|
60
|
+
2. It transforms the first eleven number words into numeric form: “THREE” becomes “3”
|
61
|
+
|
62
|
+
3. It transforms the first ten ordinals into numeric form: “SIXTH” becomes “6TH”
|
63
|
+
|
64
|
+
4. It transforms common words to a common abbreviation. "CENTER" becomes CTR
|
65
|
+
|
66
|
+
5. Leading and trailing spaces are removed
|
67
|
+
|
68
|
+
6. If the string has more than one word and more than three characters:
|
69
|
+
|
70
|
+
a. If the leading word is “A” and the second word is neither an initial nor “PLUS”, remove the leading “A”. Example: “A B & X” keeps its “A”, as does “A PLUS”, but “A TOUCH OF CLASS” does not.
|
71
|
+
|
72
|
+
b. If the last word in the name is a “company word” as defined by IsCompanyWord, then remove it and repeat the check and removal once more on the resulting last word.
|
73
|
+
|
74
|
+
c. If the last character in the name is now “&”, remove it
|
75
|
+
|
76
|
+
7. Remove all spaces from the name and return the result.
|
77
|
+
|
78
|
+
## IsCompanyWord Function
|
79
|
+
The following words are recognized as commonly-used in company names without adding to the distinguishing characteristics of the name.
|
80
|
+
|
81
|
+
ADV, ADVERTISER, ADVERTISING, AGCY, AGENCY, AGY, ASC, ASS, ASSN, ASSOC, ASSOCIAT, ASSOCIATES, ASSOCIATION, ATTORNEY, ATTRNY, ATTY, ATY, AUTO, CO, COMP, COMPANIES, COMPANY, CORP, CORPORATION, CT, CONTRA, DEPARTMENT, DEPT, DIR, DIRECT, DIV, DIVISION, GROUP, HOLDINGS, INC, INCORPORATED, INT, LIMITED, LLC, LLP, LOCAL, LTD, PC, PLC, PROD, PRODS, PRODUCT, PRODUCTIONS, PRODUCTS, TR, TRADE
|
82
|
+
|
83
|
+
## Contributing to consistent_company
|
84
|
+
|
85
|
+
* Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
|
86
|
+
* Check out the issue tracker to make sure someone already hasn't requested it and/or contributed it
|
87
|
+
* Fork the project
|
88
|
+
* Start a feature/bugfix branch
|
89
|
+
* Commit and push until you are happy with your contribution
|
90
|
+
* Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
|
91
|
+
* Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or is otherwise necessary, that is fine, but please isolate to its own commit so I can cherry-pick around it.
|
92
|
+
|
93
|
+
## Copyright
|
94
|
+
|
95
|
+
Copyright (c) 2011 Doug Cleven. See LICENSE.txt for
|
96
|
+
further details.
|
97
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'bundler'
|
5
|
+
|
6
|
+
begin
|
7
|
+
Bundler.setup(:default, :development)
|
8
|
+
rescue Bundler::BundlerError => e
|
9
|
+
$stderr.puts e.message
|
10
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
11
|
+
exit e.status_code
|
12
|
+
end
|
13
|
+
require 'rake'
|
14
|
+
|
15
|
+
require "rake/extensiontask"
|
16
|
+
Rake::ExtensionTask.new("consistent_company") do |extension|
|
17
|
+
extension.lib_dir = "lib/consistent_company"
|
18
|
+
end
|
19
|
+
|
20
|
+
task :build => [:clean, :compile]
|
21
|
+
|
22
|
+
require 'jeweler'
|
23
|
+
require './lib/consistent_company/version.rb'
|
24
|
+
Jeweler::Tasks.new do |gem|
|
25
|
+
# gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
|
26
|
+
gem.name = "consistent_company"
|
27
|
+
gem.homepage = "http://github.com/dcleven/consistent_company"
|
28
|
+
gem.license = "MIT"
|
29
|
+
gem.summary = %Q{Normalize a company name for consistent matching}
|
30
|
+
gem.description = %Q{Ruby C Extension to normalize a company name. Useful when company names come from various sources.}
|
31
|
+
gem.email = "dcleven@marketron.com"
|
32
|
+
gem.authors = ["Doug Cleven"]
|
33
|
+
gem.version = ConsistentCompany::Version::STRING
|
34
|
+
gem.files.include('lib/consistent_company/*') # add native stuff
|
35
|
+
# dependencies defined in Gemfile
|
36
|
+
end
|
37
|
+
Jeweler::RubygemsDotOrgTasks.new
|
38
|
+
|
39
|
+
task :test => [:build]
|
40
|
+
|
41
|
+
require 'rake/testtask'
|
42
|
+
Rake::TestTask.new(:test) do |test|
|
43
|
+
test.libs << 'lib' << 'test'
|
44
|
+
test.pattern = 'test/**/test_*.rb'
|
45
|
+
test.verbose = true
|
46
|
+
end
|
47
|
+
|
48
|
+
task :default => :test
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.0
|
@@ -0,0 +1,70 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = %q{consistent_company}
|
8
|
+
s.version = "0.0.3"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = [%q{Doug Cleven}]
|
12
|
+
s.date = %q{2011-09-20}
|
13
|
+
s.description = %q{Ruby C Extension to normalize a company name. Useful when company names come from various sources.}
|
14
|
+
s.email = %q{dcleven@marketron.com}
|
15
|
+
s.extensions = [%q{ext/consistent_company/extconf.rb}]
|
16
|
+
s.extra_rdoc_files = [
|
17
|
+
"LICENSE.txt",
|
18
|
+
"README.md"
|
19
|
+
]
|
20
|
+
s.files = [
|
21
|
+
".document",
|
22
|
+
"Gemfile",
|
23
|
+
"Gemfile.lock",
|
24
|
+
"LICENSE.txt",
|
25
|
+
"README.md",
|
26
|
+
"Rakefile",
|
27
|
+
"VERSION",
|
28
|
+
"consistent_company.gemspec",
|
29
|
+
"ext/consistent_company/consistent_company.c",
|
30
|
+
"ext/consistent_company/extconf.rb",
|
31
|
+
"lib/consistent_company.rb",
|
32
|
+
"lib/consistent_company/consistent_company.bundle",
|
33
|
+
"lib/consistent_company/version.rb",
|
34
|
+
"test/helper.rb",
|
35
|
+
"test/test_consistent_company.rb"
|
36
|
+
]
|
37
|
+
s.homepage = %q{http://github.com/dcleven/consistent_company}
|
38
|
+
s.licenses = [%q{MIT}]
|
39
|
+
s.require_paths = [%q{lib}]
|
40
|
+
s.rubygems_version = %q{1.8.6}
|
41
|
+
s.summary = %q{Normalize a company name for consistent matching}
|
42
|
+
|
43
|
+
if s.respond_to? :specification_version then
|
44
|
+
s.specification_version = 3
|
45
|
+
|
46
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
47
|
+
s.add_development_dependency(%q<shoulda>, [">= 0"])
|
48
|
+
s.add_development_dependency(%q<bundler>, ["~> 1.0.18"])
|
49
|
+
s.add_development_dependency(%q<jeweler>, ["~> 1.6.4"])
|
50
|
+
s.add_development_dependency(%q<rcov>, [">= 0"])
|
51
|
+
s.add_development_dependency(%q<pry>, [">= 0"])
|
52
|
+
s.add_development_dependency(%q<rake-compiler>, ["~> 0.7.6"])
|
53
|
+
else
|
54
|
+
s.add_dependency(%q<shoulda>, [">= 0"])
|
55
|
+
s.add_dependency(%q<bundler>, ["~> 1.0.18"])
|
56
|
+
s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
|
57
|
+
s.add_dependency(%q<rcov>, [">= 0"])
|
58
|
+
s.add_dependency(%q<pry>, [">= 0"])
|
59
|
+
s.add_dependency(%q<rake-compiler>, ["~> 0.7.6"])
|
60
|
+
end
|
61
|
+
else
|
62
|
+
s.add_dependency(%q<shoulda>, [">= 0"])
|
63
|
+
s.add_dependency(%q<bundler>, ["~> 1.0.18"])
|
64
|
+
s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
|
65
|
+
s.add_dependency(%q<rcov>, [">= 0"])
|
66
|
+
s.add_dependency(%q<pry>, [">= 0"])
|
67
|
+
s.add_dependency(%q<rake-compiler>, ["~> 0.7.6"])
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
@@ -0,0 +1,383 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
#include <ctype.h>
|
3
|
+
#include <string.h>
|
4
|
+
#include <stdlib.h>
|
5
|
+
|
6
|
+
|
7
|
+
char * TransformCompany(char * inString);
|
8
|
+
static int IsCompanyWord(char * inWord);
|
9
|
+
char *trimwhitespace(char *str);
|
10
|
+
char *str_replace(char *orig, const char *rep, const char *with);
|
11
|
+
|
12
|
+
/* Returns the receiver unchanged. It is not registered anywhere in Init_consistent_company() in this file, so it appears to be unused. */
static VALUE rb_ConsistentCompany_Init(VALUE self)
|
13
|
+
{
|
14
|
+
return self;
|
15
|
+
}
|
16
|
+
|
17
|
+
|
18
|
+
/* Slide the NUL-terminated tail at src down to dst. Both point into the same
   buffer with dst <= src, so memmove (not strcpy) is required. */
static void cc_close_gap(char *dst, const char *src)
{
    memmove(dst, src, strlen(src) + 1);
}

/* Append text to out at offset *outLen, keeping out NUL-terminated. */
static void cc_append(char *out, size_t *outLen, const char *text)
{
    size_t n = strlen(text);
    memcpy(out + *outLen, text, n + 1);
    *outLen += n;
}

/*
  String#company_namer

  Returns a new String holding the normalized, space-free form of the
  receiver; the receiver itself is never modified.

  Steps:
    1. Copy the receiver and upper-case it.
    2. Remove up to two parenthesised groups (see the patterns below).
    3. Keep A-Z, 0-9 and bytes >= 128; drop apostrophes; turn '&' into AND;
       turn '+' into PLUS after a lone leading "A", otherwise into AND;
       collapse everything else into single spaces.
    4. Turn " AND " into " & ", trim, and hand the text to TransformCompany.

  Raises TypeError if the receiver is not a String and NoMemoryError if the
  scratch buffers cannot be allocated.
*/
static VALUE rb_CompanyNamer(VALUE self)
{
    long srcLen, len, i;
    int numLefts = 0, numRights = 0;
    long left1 = -1, right1 = -1, left2 = -1, right2 = -1;
    size_t outLen = 0;
    char *workString, *returnString, *inString, *result;
    VALUE return_value;

    Check_Type(self, T_STRING);
    srcLen = RSTRING_LEN(self);

    /* Every input byte can expand to at most 5 output bytes (" AND "), and the
       single "A PLUS " rewrite adds a handful more; TransformCompany needs 2
       extra bytes for its padding. 5 * n + 16 covers all of that. */
    workString = malloc((size_t)srcLen + 1);
    returnString = malloc(5 * (size_t)srcLen + 16);
    if (workString == NULL || returnString == NULL)
    {
        free(workString);
        free(returnString);
        rb_memerror();
    }

    /* Copy by length: the Ruby buffer is not treated as a C string. */
    memcpy(workString, RSTRING_PTR(self), (size_t)srcLen);
    workString[srcLen] = '\0';

    /* toupper() needs an unsigned char value; a negative char is undefined. */
    for (i = 0; workString[i]; i++)
        workString[i] = (char)toupper((unsigned char)workString[i]);

    inString = trimwhitespace(workString);
    len = (long)strlen(inString);

    /* Record the positions of the first two '(' and the first two ')'. */
    for (i = 0; i < len; i++)
    {
        if (inString[i] == '(')
        {
            numLefts++;
            if (numLefts == 1)
                left1 = i;
            else if (numLefts == 2)
                left2 = i;
        }
        else if (inString[i] == ')')
        {
            numRights++;
            if (numRights == 1)
                right1 = i;
            else if (numRights == 2)
                right2 = i;
        }
    }

    if (numLefts == 0 || inString[0] == '(')
    {
        /* No parentheses, or the name starts with one: leave the text alone. */
    }
    else if (numLefts == 1)
    {
        if (right1 > left1)
        {
            /* ..(xx).. */
            inString[left1] = ' ';
            cc_close_gap(inString + left1 + 1, inString + right1 + 1);
        }
        else
        {
            /* ..(xx */
            inString[left1] = '\0';
        }
    }
    else if (numLefts == 2)
    {
        /* Number of characters between the first ')' and the second '('. */
        long mid = left2 - right1 - 1;

        if ((left1 < right1) && (right1 < left2) && (left2 < right2))
        {
            /* ..(xx)..(xx).. */
            inString[left1] = ' ';
            memmove(inString + left1 + 1, inString + right1 + 1, (size_t)mid);
            inString[left1 + 1 + mid] = ' ';
            cc_close_gap(inString + left1 + 2 + mid, inString + right2 + 1);
        }
        else if ((left1 < left2) && (left2 < right1) && (right1 < right2))
        {
            /* ..(xx(xx)xx).. */
            inString[left1] = ' ';
            cc_close_gap(inString + left1 + 1, inString + right2 + 1);
        }
        else if ((left1 < right1) && (right1 < left2) && (right2 == -1))
        {
            /* ..(xx)..(xx
               Terminate directly after the copied middle text. The previous
               code terminated one byte later and kept a stray character. */
            inString[left1] = ' ';
            memmove(inString + left1 + 1, inString + right1 + 1, (size_t)mid);
            inString[left1 + 1 + mid] = '\0';
        }
        else if ((left1 < left2) && (left2 < right1) && (right2 == -1))
        {
            /* ..(xx(xx)xx */
            inString[left1] = '\0';
        }
        else if ((right1 == -1) && (right2 == -1))
        {
            /* ..(xx(xx */
            inString[left1] = '\0';
        }
    }

    /* Character-level pass. outLen tracks the output length so nothing has to
       be re-measured with strlen() on every iteration. */
    returnString[0] = '\0';
    for (i = 0; inString[i]; i++)
    {
        unsigned char ch = (unsigned char)inString[i];
        int lastIsSpace = (outLen > 0 && returnString[outLen - 1] == ' ');

        if ((ch >= 'A' && ch <= 'Z') ||
            (ch >= '0' && ch <= '9') ||
            (ch >= 128)) /* A-Z, 0-9, and high order chars */
        {
            returnString[outLen++] = (char)ch;
            returnString[outLen] = '\0';
        }
        else if (ch == '\'')
        {
            /* Apostrophes are removed without leaving a space. */
        }
        else if (ch == '&' && outLen > 0)
        {
            cc_append(returnString, &outLen, lastIsSpace ? "AND " : " AND ");
        }
        else if (ch == '+')
        {
            if (strcmp(returnString, "A") == 0 || strcmp(returnString, "A ") == 0)
            {
                outLen = 0;
                cc_append(returnString, &outLen, "A PLUS ");
            }
            else if (outLen > 0)
            {
                cc_append(returnString, &outLen, lastIsSpace ? "AND " : " AND ");
            }
        }
        else if (outLen > 0 && !lastIsSpace)
        {
            /* Any other character becomes a single separating space. */
            cc_append(returnString, &outLen, " ");
        }
    }

    str_replace(returnString, " AND ", " & ");

    /* result points into returnString; keep the base pointer for free(). */
    result = trimwhitespace(returnString);
    result = trimwhitespace(TransformCompany(result));

    return_value = rb_str_new2(result);
    free(returnString);
    free(workString);
    return return_value;
}
|
158
|
+
|
159
|
+
/*
  TransformCompany
  Given a string, transform typical company name parts to common abbreviations,
  thereby normalizing the name and making exact matching of different names easier.
  example:
    FIRST FEDERAL SAVINGS becomes 1ST FEDERAL SAVINGS

  resultString is edited in place and MUST have room for 2 bytes beyond its
  current contents (plus the terminator): the text is temporarily wrapped in
  one leading and one trailing space so that whole words can be matched as
  " WORD ".

  Returns a pointer into resultString (not necessarily its start) holding the
  transformed name with all spaces removed.
*/
char * TransformCompany(char * resultString)
{
    /* Whole-word replacements, applied in this order. Every replacement is no
       longer than the text it replaces, as str_replace requires. */
    static const char *const replacements[][2] = {
        { " THE ", " " },
        { " ONE ", " 1 " },
        { " TWO ", " 2 " },
        { " TO ", " 2 " },
        { " THREE ", " 3 " },
        { " FOUR ", " 4 " },
        { " FOR ", " 4 " },
        { " FIVE ", " 5 " },
        { " SIX ", " 6 " },
        { " SEVEN ", " 7 " },
        { " EIGHT ", " 8 " },
        { " NINE ", " 9 " },
        { " TEN ", " 10 " },
        { " ELEVEN ", " 11 " },

        { " FIRST ", " 1ST " },
        { " SECOND ", " 2ND " },
        { " THIRD ", " 3RD " },
        { " FOURTH ", " 4TH " },
        { " FIFTH ", " 5TH " },
        { " SIXTH ", " 6TH " },
        { " SEVENTH ", " 7TH " },
        { " EIGHTH ", " 8TH " },
        { " NINTH ", " 9TH " },
        { " TENTH ", " 10TH " },
        { " CENTRE ", " CTR " },
        { " CENTER ", " CTR " },
        { " AUTOMOTIVE ", " AUTO " },
        { " AUTOMOBILE ", " AUTO " },
        { " AUTOS ", " AUTO " },
        { " AVENUE ", " AVE " },
        { " DRIVE ", " DR " },
        { " PHOTOGRAPHY ", " PHOTO " },
        { " BROTHERS ", " BROS " },
        { " TECHNOLOGY ", " TEC " },
        { " TECH ", " TEC " },
        { " TELEVISION ", " TV " },
        { " INFORMATION ", " INFO " },
        { " SOCIETY ", " SOC " },
        { " DEPARTMENT ", " DEPT " },
        { " REGIONAL ", " REG " },
        { " REGION ", " REG " },
        { " AUTHORITY ", " AUTH " },
        { " NATIONAL ", " NATL " },
        { " INTERNATIONAL ", " INT " },
        { " INTERNATION ", " INT " },
        { " INTL ", " INT " },
        { " MARKETING ", " MKTG " },
        { " MANAGEMENT ", " MGT " },
        { " MGMT ", " MGT " }
    };
    const size_t replacementCount = sizeof(replacements) / sizeof(replacements[0]);
    size_t k, n;
    int pass;
    char * spaceLoc;
    char * s;

    /* Pad with one space on each side, in place. The previous version built
       the padded text in a malloc(strlen) scratch buffer that was 3 bytes too
       small. */
    n = strlen(resultString);
    memmove(resultString + 1, resultString, n);
    resultString[0] = ' ';
    resultString[n + 1] = ' ';
    resultString[n + 2] = '\0';

    for (k = 0; k < replacementCount; k++)
        str_replace(resultString, replacements[k][0], replacements[k][1]);

    s = trimwhitespace(resultString);
    spaceLoc = strchr(s, ' ');
    if (spaceLoc && strlen(s) > 3) /* More than one word and more than 3 chars */
    {
        /* Check for "A" as the first word, and make sure that the second word
           is not an initial or the word "PLUS".
           For example: "A C & R" keeps its "A"; "A TOUCH OF CLASS" loses it. */
        if (strncmp(s, "A ", 2) == 0 &&
            s[2] != '&' &&
            s[3] != ' ' &&
            strncmp(s + 2, "PLUS", 4) != 0)
        {
            /* Source and destination overlap, so memmove rather than strcpy. */
            memmove(s, s + 2, strlen(s + 2) + 1);
        }

        /* Strip a trailing company word, then check the new last word once
           more. The word is tested in place, so no copy buffer is needed. */
        for (pass = 0; pass < 2; pass++)
        {
            spaceLoc = strrchr(s, ' ');
            if (!spaceLoc || !IsCompanyWord(spaceLoc + 1))
                break;
            *spaceLoc = '\0';
        }

        /* Drop an '&' left dangling at the end. */
        n = strlen(s);
        if (n > 0 && s[n - 1] == '&')
            s[n - 1] = '\0';
    }

    str_replace(s, " ", "");
    return s;
}
|
274
|
+
|
275
|
+
/*
  IsCompanyWord
  Returns 1 if the NUL-terminated, upper-case word passed in inWord is a
  typical company word, and 0 otherwise (including when inWord is NULL).
  The comparison is exact and case-sensitive.
  Add more company words to the table below if desired.
*/
int IsCompanyWord(char * inWord)
{
    static const char *const companyWords[] = {
        "ADV", "ADVERTISER", "ADVERTISING",
        "AGCY", "AGENCY", "AGY",
        "ASC", "ASS", "ASSN", "ASSOC", "ASSOCIAT", "ASSOCIATES", "ASSOCIATION",
        "ATTORNEY", "ATTRNY", "ATTY", "ATY",
        "AUTO",
        "CO", "COMP", "COMPANIES", "COMPANY", "CORP", "CORPORATION",
        "CT", "CONTRA",
        "DEPARTMENT", "DEPT",
        "DIR", "DIRECT", "DIV", "DIVISION",
        "GROUP", "HOLDINGS",
        "INC", "INCORPORATED", "INT",
        "LIMITED", "LLC", "LLP", "LOCAL", "LTD",
        "PC", "PLC",
        "PROD", "PRODS", "PRODUCT", "PRODUCTIONS", "PRODUCTS",
        "TR", "TRADE"
    };
    const size_t wordCount = sizeof(companyWords) / sizeof(companyWords[0]);
    size_t k;

    if (inWord == 0)
        return 0;

    for (k = 0; k < wordCount; k++)
    {
        if (strcmp(inWord, companyWords[k]) == 0)
            return 1;
    }
    return 0;
} // IsCompanyWord
|
339
|
+
|
340
|
+
/*
  Trim whitespace from front and back of string.

  Works in place: trailing whitespace is cut off by writing a new terminator
  into str, and leading whitespace is skipped by returning a pointer further
  into the same buffer. The returned pointer may therefore differ from str
  and must not be passed to free(). Returns NULL when str is NULL.
*/
char *trimwhitespace(char *str)
{
    char *end;

    if (str == NULL)
        return NULL;

    // Trim leading space. isspace() needs an unsigned char value; passing a
    // negative char (any byte >= 128 where char is signed) is undefined.
    while (isspace((unsigned char)*str))
        str++;

    if (*str == '\0') // All spaces?
        return str;

    // Trim trailing space
    end = str + strlen(str) - 1;
    while (end > str && isspace((unsigned char)*end))
        end--;

    // Write new null terminator
    end[1] = '\0';

    return str;
}
|
362
|
+
|
363
|
+
// Replace every occurrence of rep in orig with `with`, editing orig in place,
// and return orig.
//
// !!!! This ONLY works where `with` is no longer than `rep`: the text can
// shrink in place but never grow. If `with` is longer than `rep`, if rep is
// empty, or if any argument is NULL, orig is returned unchanged.
//
// After each replacement the scan resumes on the LAST character of the
// inserted text, so a trailing delimiter can double as the leading delimiter
// of the next match (" CENTER CENTER " becomes " CTR CTR ").
char *str_replace(char *orig, const char *rep, const char *with)
{
    size_t repLen, withLen;
    char * s = orig;

    if (orig == NULL || rep == NULL || with == NULL)
        return orig;

    repLen = strlen(rep);
    withLen = strlen(with);
    if (repLen == 0 || withLen > repLen)
        return orig;

    while ((s = strstr(s, rep)) != NULL)
    {
        memcpy(s, with, withLen);
        // The tail overlaps its destination, so memmove rather than strcpy.
        memmove(s + withLen, s + repLen, strlen(s + repLen) + 1);

        if (withLen > 0)
            s += withLen - 1;
        else if (s > orig)
            s--; // empty replacement: step back one, but never before orig
    }
    return orig;
}
|
375
|
+
|
376
|
+
|
377
|
+
void Init_consistent_company()
|
378
|
+
{
|
379
|
+
VALUE rb_mConsistentCompany = rb_define_module("ConsistentCompany");
|
380
|
+
VALUE string = rb_define_class("String", rb_cObject);
|
381
|
+
rb_define_method(rb_mConsistentCompany, "company_namer", rb_CompanyNamer, 0);
|
382
|
+
rb_include_module(string, rb_mConsistentCompany);
|
383
|
+
}
|
Binary file
|
data/test/helper.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'bundler'
|
3
|
+
begin
|
4
|
+
Bundler.setup(:default, :development)
|
5
|
+
rescue Bundler::BundlerError => e
|
6
|
+
$stderr.puts e.message
|
7
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
8
|
+
exit e.status_code
|
9
|
+
end
|
10
|
+
require 'test/unit'
|
11
|
+
#require 'shoulda'
|
12
|
+
|
13
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib/consistent_company'))
|
14
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'ext/consistent_company'))
|
15
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
16
|
+
|
17
|
+
|
18
|
+
class Test::Unit::TestCase
|
19
|
+
end
|
@@ -0,0 +1,73 @@
|
|
1
|
+
require 'helper'
|
2
|
+
require 'consistent_company'
|
3
|
+
require 'pry'
|
4
|
+
|
5
|
+
# Exercises String#company_namer from the compiled extension.
# Each behavior has its own test method so that one failing assertion does
# not hide failures in unrelated behaviors.
class TestConsistentCompany < Test::Unit::TestCase

  def test_company_namer
    # don't change the calling string
    str = ' my test '
    str.company_namer
    assert_equal(' my test ', str)
  end

  def test_whitespace_removal
    # remove leading and trailing space
    assert_equal('TEST', " test ".company_namer)
    # remove embedded space
    assert_equal('TEST', " te st ".company_namer)
  end

  def test_company_word_removal
    # remove Company
    assert_equal("MYTEST", "My Test Company".company_namer)
    assert_equal("MYCOMPANYTEST", ("MY COMPANY TEST").company_namer)
    assert_equal("MYTEST", ("MY TEST COMPANY COMP").company_namer)
    # remove leading The
    assert_equal("AAA", "The AAA Company".company_namer)
  end

  def test_punctuation_removal
    assert_equal("TESTERS", %q{The, ?%^* tester's company!}.company_namer)
  end

  def test_empty_and_long_names
    # empty name
    assert_equal("", "".company_namer)
    # a very long name
    assert_equal("A"*1000, ("A"*1000).company_namer)
  end

  def test_parenthesis_matching
    assert_equal("BBEE", ("BB(xx)EE").company_namer)
    assert_equal("BE", ("B(xx)E").company_namer)
    assert_equal("XX", ("(xx)").company_namer)
    assert_equal("BB", ("BB(xx").company_namer)
    assert_equal("XX", ("(xx").company_namer)
    assert_equal("BBMMEE", ("BB(xx)MM(xx)EE").company_namer)
    assert_equal("BBEE", ("BB(xx(xx)xx)EE").company_namer)
    assert_equal("BBMM", ("BB(xx)MM(xx").company_namer)
    assert_equal("BB", ("BB(xx(xx)xx").company_namer)
    assert_equal("BB", ("BB(xx(xx").company_namer)
  end

  def test_and_ampersand
    # handle and &
    assert_equal("PRE&POST", ("pre and post").company_namer)
    assert_equal("PRE&POST", ("pre & post").company_namer)
    assert_equal("PRE&POST", ("&pre and post&").company_namer)
    assert_equal("PRE&POST", ("& pre and post &").company_namer)
    assert_equal("ANDPRE&POSTAND", ("and pre and post and").company_namer)
  end

  def test_leading_a
    assert_equal("ABTEST", ("A B TEST").company_namer)
    assert_equal("BTEST", ("A BTEST").company_namer)
    assert_equal("APLUSTEST", ("A PLUS TEST").company_namer)
    assert_equal("APLUSTEST", ("A + TEST").company_namer)
    assert_equal("APLUSTEST", ("A+ TEST").company_namer)
  end

  def test_common_name_shortening
    assert_equal("TESTCTRCTRCTR", ("Test Center Center Center").company_namer)

    # different spellings of the same company must normalize identically
    assert_equal("My Test Advertising Co".company_namer, "MY TEST ADV COMPANY".company_namer)
  end

  # def test_benchmark
  #   looptimes = 1000000
  #
  #   puts "[BaseString]"
  #   puts Benchmark::CAPTION
  #   puts Benchmark.measure {
  #     base = BaseString.new
  #     looptimes.times { |n|
  #       base.make " My Test Company Name (A)"
  #     }
  #   }
  # end
end
|
metadata
ADDED
@@ -0,0 +1,133 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: consistent_company
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.3
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Doug Cleven
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2011-09-20 00:00:00.000000000Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: shoulda
|
16
|
+
requirement: &70285303244580 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *70285303244580
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: bundler
|
27
|
+
requirement: &70285303243640 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ~>
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: 1.0.18
|
33
|
+
type: :development
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *70285303243640
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: jeweler
|
38
|
+
requirement: &70285303242660 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ~>
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: 1.6.4
|
44
|
+
type: :development
|
45
|
+
prerelease: false
|
46
|
+
version_requirements: *70285303242660
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: rcov
|
49
|
+
requirement: &70285303241720 !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
type: :development
|
56
|
+
prerelease: false
|
57
|
+
version_requirements: *70285303241720
|
58
|
+
- !ruby/object:Gem::Dependency
|
59
|
+
name: pry
|
60
|
+
requirement: &70285303241040 !ruby/object:Gem::Requirement
|
61
|
+
none: false
|
62
|
+
requirements:
|
63
|
+
- - ! '>='
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: '0'
|
66
|
+
type: :development
|
67
|
+
prerelease: false
|
68
|
+
version_requirements: *70285303241040
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rake-compiler
|
71
|
+
requirement: &70285303240300 !ruby/object:Gem::Requirement
|
72
|
+
none: false
|
73
|
+
requirements:
|
74
|
+
- - ~>
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: 0.7.6
|
77
|
+
type: :development
|
78
|
+
prerelease: false
|
79
|
+
version_requirements: *70285303240300
|
80
|
+
description: Ruby C Extension to normalize a company name. Useful when company names
|
81
|
+
come from various sources.
|
82
|
+
email: dcleven@marketron.com
|
83
|
+
executables: []
|
84
|
+
extensions:
|
85
|
+
- ext/consistent_company/extconf.rb
|
86
|
+
extra_rdoc_files:
|
87
|
+
- LICENSE.txt
|
88
|
+
- README.md
|
89
|
+
files:
|
90
|
+
- .document
|
91
|
+
- Gemfile
|
92
|
+
- Gemfile.lock
|
93
|
+
- LICENSE.txt
|
94
|
+
- README.md
|
95
|
+
- Rakefile
|
96
|
+
- VERSION
|
97
|
+
- consistent_company.gemspec
|
98
|
+
- ext/consistent_company/consistent_company.c
|
99
|
+
- ext/consistent_company/extconf.rb
|
100
|
+
- lib/consistent_company.rb
|
101
|
+
- lib/consistent_company/consistent_company.bundle
|
102
|
+
- lib/consistent_company/version.rb
|
103
|
+
- test/helper.rb
|
104
|
+
- test/test_consistent_company.rb
|
105
|
+
homepage: http://github.com/dcleven/consistent_company
|
106
|
+
licenses:
|
107
|
+
- MIT
|
108
|
+
post_install_message:
|
109
|
+
rdoc_options: []
|
110
|
+
require_paths:
|
111
|
+
- lib
|
112
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
113
|
+
none: false
|
114
|
+
requirements:
|
115
|
+
- - ! '>='
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
segments:
|
119
|
+
- 0
|
120
|
+
hash: -1402416125086284082
|
121
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
122
|
+
none: false
|
123
|
+
requirements:
|
124
|
+
- - ! '>='
|
125
|
+
- !ruby/object:Gem::Version
|
126
|
+
version: '0'
|
127
|
+
requirements: []
|
128
|
+
rubyforge_project:
|
129
|
+
rubygems_version: 1.8.6
|
130
|
+
signing_key:
|
131
|
+
specification_version: 3
|
132
|
+
summary: Normalize a company name for consistent matching
|
133
|
+
test_files: []
|