consistent_company 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
data/README.md
CHANGED
@@ -6,6 +6,8 @@ This was written with the Advertising domain in mind. So many of the common advert
|
|
6
6
|
handled well. Although it can work with other domains, you may find you need to enhance the specific company name
|
7
7
|
identifiers in IsCompanyName and TransformCompany.
|
8
8
|
|
9
|
+
Tested with over 600,000 names from hundreds of sources, where an overlap of names is expected, with a match rate of 40%.
|
10
|
+
|
9
11
|
## Install:
|
10
12
|
gem install consistent_company
|
11
13
|
|
data/consistent_company.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{consistent_company}
|
8
|
-
s.version = "0.0.3"
|
8
|
+
s.version = "0.0.4"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = [%q{Doug Cleven}]
|
12
|
-
s.date = %q{2011-09-
|
12
|
+
s.date = %q{2011-09-30}
|
13
13
|
s.description = %q{Ruby C Extension to normalize a company name. Useful when company names come from various sources.}
|
14
14
|
s.email = %q{dcleven@marketron.com}
|
15
15
|
s.extensions = [%q{ext/consistent_company/extconf.rb}]
|
@@ -7,6 +7,7 @@
|
|
7
7
|
char * TransformCompany(char * inString);
|
8
8
|
static int IsCompanyWord(char * inWord);
|
9
9
|
char *trimwhitespace(char *str);
|
10
|
+
char *trimsuffix(char *str, const char *suffix);
|
10
11
|
char *str_replace(char *orig, const char *rep, const char *with);
|
11
12
|
|
12
13
|
static VALUE rb_ConsistentCompany_Init(VALUE self)
|
@@ -18,6 +19,20 @@ static VALUE rb_ConsistentCompany_Init(VALUE self)
|
|
18
19
|
static VALUE rb_CompanyNamer(VALUE self)
|
19
20
|
{
|
20
21
|
char * pSelf = RSTRING_PTR(self);
|
22
|
+
int selfLen = strlen(pSelf)+2;
|
23
|
+
int workLen = selfLen;
|
24
|
+
char * s = pSelf;
|
25
|
+
|
26
|
+
// calc size of work strings
|
27
|
+
// while processing we turn & = AND, + = PLUS
|
28
|
+
// and we add space at front and back
|
29
|
+
while (s = strpbrk(s, "&+"))
|
30
|
+
{
|
31
|
+
workLen +=3; // worst case we add 3 chars
|
32
|
+
s++;
|
33
|
+
}
|
34
|
+
workLen += 2; // add space front and back
|
35
|
+
//////////////
|
21
36
|
|
22
37
|
// for company only
|
23
38
|
int i;
|
@@ -25,8 +40,8 @@ static VALUE rb_CompanyNamer(VALUE self)
|
|
25
40
|
int asc;
|
26
41
|
int numLefts = 0, numRights = 0;
|
27
42
|
int left1 = -1, right1 = -1, left2 = -1, right2 = -1;
|
28
|
-
char * workString = malloc(
|
29
|
-
char * returnString = malloc(
|
43
|
+
char * workString = malloc(workLen); // 2 extra chars for TransformCompany
|
44
|
+
char * returnString = malloc(workLen);
|
30
45
|
char * inString;
|
31
46
|
strcpy(workString, pSelf);
|
32
47
|
inString = workString;
|
@@ -146,8 +161,17 @@ static VALUE rb_CompanyNamer(VALUE self)
|
|
146
161
|
strcat(returnString, " ");
|
147
162
|
}
|
148
163
|
}
|
164
|
+
// if (strlen(workString) > workLen || strlen(returnString) > workLen)
|
165
|
+
// {
|
166
|
+
// char buff[200];
|
167
|
+
// sprintf(buff, "workLen %d %s workString %d returnString %d %s", workLen, pSelf, strlen(workString), strlen(returnString), returnString);
|
168
|
+
// return rb_str_new2(trimwhitespace(buff));
|
169
|
+
// }
|
149
170
|
char * p;
|
150
171
|
str_replace(returnString, " AND ", " & ");
|
172
|
+
|
173
|
+
int oldLen = strlen(returnString);
|
174
|
+
// returnString = trimsuffix(returnString, "s");
|
151
175
|
returnString = trimwhitespace(returnString);
|
152
176
|
strcpy(returnString, TransformCompany(returnString));
|
153
177
|
VALUE return_value = rb_str_new2(trimwhitespace(returnString));
|
@@ -166,7 +190,7 @@ FIRST FEDERAL SAVINGS becomes 1ST FEDERAL SAVINGS
|
|
166
190
|
char * TransformCompany(char * resultString)
|
167
191
|
{
|
168
192
|
// resultString should have been allocated with 2 extra char for our padding here
|
169
|
-
char * buf = malloc(strlen(resultString));
|
193
|
+
char * buf = malloc(strlen(resultString)+3);
|
170
194
|
strcpy(buf, " ");
|
171
195
|
strcat(buf,resultString);
|
172
196
|
strcat(buf, " ");
|
@@ -203,6 +227,10 @@ char * TransformCompany(char * resultString)
|
|
203
227
|
str_replace(s, " TENTH ", " 10TH ");
|
204
228
|
str_replace(s, " CENTRE ", " CTR ");
|
205
229
|
str_replace(s, " CENTER ", " CTR ");
|
230
|
+
str_replace(s, " CNTR ", " CTR ");
|
231
|
+
str_replace(s, " CTR ", " CTR ");
|
232
|
+
str_replace(s, " CENT ", " CTR ");
|
233
|
+
str_replace(s, " CENTR ", " CTR ");
|
206
234
|
str_replace(s, " AUTOMOTIVE ", " AUTO ");
|
207
235
|
str_replace(s, " AUTOMOBILE ", " AUTO ");
|
208
236
|
str_replace(s, " AUTOS ", " AUTO ");
|
@@ -247,7 +275,7 @@ char * TransformCompany(char * resultString)
|
|
247
275
|
//spaceLoc = resultString.LastIndexOf(" ");
|
248
276
|
if (spaceLoc) // Look at the last word
|
249
277
|
{
|
250
|
-
char * lastWord = malloc(strlen(spaceLoc));
|
278
|
+
char * lastWord = malloc(strlen(spaceLoc)+1);
|
251
279
|
strcpy(lastWord, spaceLoc + 1);
|
252
280
|
if (IsCompanyWord(lastWord))
|
253
281
|
{
|
@@ -360,6 +388,44 @@ char *trimwhitespace(char *str)
|
|
360
388
|
return str;
|
361
389
|
}
|
362
390
|
|
391
|
+
char *trimsuffix(char *str, const char *suffix)
|
392
|
+
{
|
393
|
+
char delims[] = " ";
|
394
|
+
char *result = NULL;
|
395
|
+
char *workString = malloc(strlen(str)+3);
|
396
|
+
char *workBuffer = malloc(strlen(str)+3);
|
397
|
+
strcpy(workString, str);
|
398
|
+
str[0] = '\0';
|
399
|
+
result = strtok(workString, delims);
|
400
|
+
while(result != NULL)
|
401
|
+
{
|
402
|
+
strcpy(workBuffer, result);
|
403
|
+
int len = strlen(workBuffer);
|
404
|
+
if (len > 3)
|
405
|
+
{
|
406
|
+
if (workBuffer[len-1] == 'S')
|
407
|
+
{
|
408
|
+
char * p = strstr(workBuffer, "IES");
|
409
|
+
if (p && p[3] == '\0' && strcmp(workBuffer, "SERIES") != 0)
|
410
|
+
{
|
411
|
+
*p = 'Y';
|
412
|
+
*++p = '\0';
|
413
|
+
}
|
414
|
+
if (strcmp(workBuffer, "PLUS") != 0)
|
415
|
+
workBuffer[len-1] = '\0';
|
416
|
+
}
|
417
|
+
}
|
418
|
+
strcat(str, workBuffer);
|
419
|
+
result = strtok( NULL, delims );
|
420
|
+
if (result)
|
421
|
+
strcat(str, " ");
|
422
|
+
}
|
423
|
+
free(workString);
|
424
|
+
free(workBuffer);
|
425
|
+
return str;
|
426
|
+
}
|
427
|
+
|
428
|
+
|
363
429
|
// !!!! This ONLY works where rep is longer than with
|
364
430
|
char *str_replace(char *orig, const char *rep, const char *with)
|
365
431
|
{
|
Binary file
|
@@ -24,7 +24,7 @@ class TestConsistentCompany < Test::Unit::TestCase
|
|
24
24
|
# empty name
|
25
25
|
assert_equal("", "".company_namer)
|
26
26
|
# a very long name
|
27
|
-
assert_equal("A"*1000, ("A"*1000).company_namer)
|
27
|
+
assert_equal("A"*1000+"NAMEISHERE", (" A"*1000 + 'NAME IS HERE ').company_namer)
|
28
28
|
# parenthesis matching
|
29
29
|
assert_equal("BBEE", ("BB(xx)EE").company_namer)
|
30
30
|
assert_equal("BE", ("B(xx)E").company_namer)
|
@@ -54,6 +54,10 @@ class TestConsistentCompany < Test::Unit::TestCase
|
|
54
54
|
# common name shortening
|
55
55
|
assert_equal("TESTCTRCTRCTR", ("Test Center Center Center").company_namer)
|
56
56
|
|
57
|
+
# #singularize
|
58
|
+
# assert_equal("TESTNAMEHERE", (" Test Names here").company_namer)
|
59
|
+
# assert_equal("TESTBATTERY", ("Test Batteries").company_namer)
|
60
|
+
|
57
61
|
assert_equal("My Test Advertising Co".company_namer, "MY TEST ADV COMPANY".company_namer)
|
58
62
|
end
|
59
63
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: consistent_company
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2011-09-
|
12
|
+
date: 2011-09-30 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: shoulda
|
16
|
-
requirement: &
|
16
|
+
requirement: &70187182156920 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70187182156920
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: bundler
|
27
|
-
requirement: &
|
27
|
+
requirement: &70187182156220 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 1.0.18
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70187182156220
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: jeweler
|
38
|
-
requirement: &
|
38
|
+
requirement: &70187182155600 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: 1.6.4
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70187182155600
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: rcov
|
49
|
-
requirement: &
|
49
|
+
requirement: &70187182154960 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :development
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70187182154960
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: pry
|
60
|
-
requirement: &
|
60
|
+
requirement: &70187182154360 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :development
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70187182154360
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: rake-compiler
|
71
|
-
requirement: &
|
71
|
+
requirement: &70187182153680 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ~>
|
@@ -76,7 +76,7 @@ dependencies:
|
|
76
76
|
version: 0.7.6
|
77
77
|
type: :development
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *70187182153680
|
80
80
|
description: Ruby C Extension to normalize a company name. Useful when company names
|
81
81
|
come from various sources.
|
82
82
|
email: dcleven@marketron.com
|
@@ -117,7 +117,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
117
117
|
version: '0'
|
118
118
|
segments:
|
119
119
|
- 0
|
120
|
-
hash:
|
120
|
+
hash: 3110059090780389810
|
121
121
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
122
122
|
none: false
|
123
123
|
requirements:
|