consistent_company 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md
CHANGED
@@ -6,6 +6,8 @@ This was written with the Advertising domain in mind. So many of the common advert
|
|
6
6
|
handled well. Although it can work with other domains, you may find you need to enhance the specific company name
|
7
7
|
identifiers in IsCompanyName and TransformCompany.
|
8
8
|
|
9
|
+
Tested with over 600,000 names from hundreds of sources, where an overlap of names is expected, with a match rate of 40%.
|
10
|
+
|
9
11
|
## Install:
|
10
12
|
gem install consistent_company
|
11
13
|
|
data/consistent_company.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{consistent_company}
|
8
|
-
s.version = "0.0.3"
|
8
|
+
s.version = "0.0.4"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = [%q{Doug Cleven}]
|
12
|
-
s.date = %q{2011-09-
|
12
|
+
s.date = %q{2011-09-30}
|
13
13
|
s.description = %q{Ruby C Extension to normalize a company name. Useful when company names come from various sources.}
|
14
14
|
s.email = %q{dcleven@marketron.com}
|
15
15
|
s.extensions = [%q{ext/consistent_company/extconf.rb}]
|
@@ -7,6 +7,7 @@
|
|
7
7
|
char * TransformCompany(char * inString);
|
8
8
|
static int IsCompanyWord(char * inWord);
|
9
9
|
char *trimwhitespace(char *str);
|
10
|
+
char *trimsuffix(char *str, const char *suffix);
|
10
11
|
char *str_replace(char *orig, const char *rep, const char *with);
|
11
12
|
|
12
13
|
static VALUE rb_ConsistentCompany_Init(VALUE self)
|
@@ -18,6 +19,20 @@ static VALUE rb_ConsistentCompany_Init(VALUE self)
|
|
18
19
|
static VALUE rb_CompanyNamer(VALUE self)
|
19
20
|
{
|
20
21
|
char * pSelf = RSTRING_PTR(self);
|
22
|
+
int selfLen = strlen(pSelf)+2;
|
23
|
+
int workLen = selfLen;
|
24
|
+
char * s = pSelf;
|
25
|
+
|
26
|
+
// calc size of work strings
|
27
|
+
// while processing we turn & = AND, + = PLUS
|
28
|
+
// and we add space at front and back
|
29
|
+
while (s = strpbrk(s, "&+"))
|
30
|
+
{
|
31
|
+
workLen +=3; // worst case we add 3 chars
|
32
|
+
s++;
|
33
|
+
}
|
34
|
+
workLen += 2; // add space front and back
|
35
|
+
//////////////
|
21
36
|
|
22
37
|
// for company only
|
23
38
|
int i;
|
@@ -25,8 +40,8 @@ static VALUE rb_CompanyNamer(VALUE self)
|
|
25
40
|
int asc;
|
26
41
|
int numLefts = 0, numRights = 0;
|
27
42
|
int left1 = -1, right1 = -1, left2 = -1, right2 = -1;
|
28
|
-
char * workString = malloc(
|
29
|
-
char * returnString = malloc(
|
43
|
+
char * workString = malloc(workLen); // 2 extra chars for TransformCompany
|
44
|
+
char * returnString = malloc(workLen);
|
30
45
|
char * inString;
|
31
46
|
strcpy(workString, pSelf);
|
32
47
|
inString = workString;
|
@@ -146,8 +161,17 @@ static VALUE rb_CompanyNamer(VALUE self)
|
|
146
161
|
strcat(returnString, " ");
|
147
162
|
}
|
148
163
|
}
|
164
|
+
// if (strlen(workString) > workLen || strlen(returnString) > workLen)
|
165
|
+
// {
|
166
|
+
// char buff[200];
|
167
|
+
// sprintf(buff, "workLen %d %s workString %d returnString %d %s", workLen, pSelf, strlen(workString), strlen(returnString), returnString);
|
168
|
+
// return rb_str_new2(trimwhitespace(buff));
|
169
|
+
// }
|
149
170
|
char * p;
|
150
171
|
str_replace(returnString, " AND ", " & ");
|
172
|
+
|
173
|
+
int oldLen = strlen(returnString);
|
174
|
+
// returnString = trimsuffix(returnString, "s");
|
151
175
|
returnString = trimwhitespace(returnString);
|
152
176
|
strcpy(returnString, TransformCompany(returnString));
|
153
177
|
VALUE return_value = rb_str_new2(trimwhitespace(returnString));
|
@@ -166,7 +190,7 @@ FIRST FEDERAL SAVINGS becomes 1ST FEDERAL SAVINGS
|
|
166
190
|
char * TransformCompany(char * resultString)
|
167
191
|
{
|
168
192
|
// resultString should have been allocated with 2 extra char for our padding here
|
169
|
-
char * buf = malloc(strlen(resultString));
|
193
|
+
char * buf = malloc(strlen(resultString)+3);
|
170
194
|
strcpy(buf, " ");
|
171
195
|
strcat(buf,resultString);
|
172
196
|
strcat(buf, " ");
|
@@ -203,6 +227,10 @@ char * TransformCompany(char * resultString)
|
|
203
227
|
str_replace(s, " TENTH ", " 10TH ");
|
204
228
|
str_replace(s, " CENTRE ", " CTR ");
|
205
229
|
str_replace(s, " CENTER ", " CTR ");
|
230
|
+
str_replace(s, " CNTR ", " CTR ");
|
231
|
+
str_replace(s, " CTR ", " CTR ");
|
232
|
+
str_replace(s, " CENT ", " CTR ");
|
233
|
+
str_replace(s, " CENTR ", " CTR ");
|
206
234
|
str_replace(s, " AUTOMOTIVE ", " AUTO ");
|
207
235
|
str_replace(s, " AUTOMOBILE ", " AUTO ");
|
208
236
|
str_replace(s, " AUTOS ", " AUTO ");
|
@@ -247,7 +275,7 @@ char * TransformCompany(char * resultString)
|
|
247
275
|
//spaceLoc = resultString.LastIndexOf(" ");
|
248
276
|
if (spaceLoc) // Look at the last word
|
249
277
|
{
|
250
|
-
char * lastWord = malloc(strlen(spaceLoc));
|
278
|
+
char * lastWord = malloc(strlen(spaceLoc)+1);
|
251
279
|
strcpy(lastWord, spaceLoc + 1);
|
252
280
|
if (IsCompanyWord(lastWord))
|
253
281
|
{
|
@@ -360,6 +388,44 @@ char *trimwhitespace(char *str)
|
|
360
388
|
return str;
|
361
389
|
}
|
362
390
|
|
391
|
+
/*
 * trimsuffix - naive in-place singularizer for a space-separated name.
 *
 * Every word longer than three characters that ends in an upper-case 'S'
 * is shortened:
 *   - a trailing "IES" becomes "Y"   (BATTERIES -> BATTERY)
 *   - otherwise the final 'S' is dropped (NAMES -> NAME)
 * The words "SERIES" and "PLUS" are left exactly as they are.
 * The rules are purely textual, so words such as GLASS also lose an 'S';
 * callers rely on consistency, not on linguistic correctness.
 *
 * Leading, trailing and repeated blanks are collapsed so that the words
 * in the result are separated by exactly one space.
 *
 * str    - NUL-terminated, writable string; rewritten in place. The result
 *          is never longer than the input, so no extra room is required.
 * suffix - currently ignored; the rules above are hard-coded.
 *
 * Returns str (or NULL when str is NULL).
 */
char *trimsuffix(char *str, const char *suffix)
{
    const char *src;    /* next unread character of the original text */
    char *dst;          /* next free position of the rewritten text   */

    (void)suffix;

    if (str == NULL)
        return NULL;

    src = str;
    dst = str;
    for (;;)
    {
        size_t len;
        size_t outLen;

        /* Scan by hand instead of strtok(): no hidden static state, so a
           tokenization the caller has in progress is not disturbed. */
        src += strspn(src, " ");
        if (*src == '\0')
            break;
        len = strcspn(src, " ");
        outLen = len;

        /* dst never overtakes src: every word after the first is preceded
           by at least one blank in the input, which pays for the single
           separator written here. */
        if (dst != str)
            *dst++ = ' ';
        memmove(dst, src, len);
        src += len;

        if (len > 3 && dst[len - 1] == 'S')
        {
            int isSeries = (len == 6 && memcmp(dst, "SERIES", 6) == 0);
            int isPlus = (len == 4 && memcmp(dst, "PLUS", 4) == 0);

            if (!isSeries && !isPlus)
            {
                /* Test the real suffix; searching for the first "IES"
                   would miss words that contain it twice. */
                if (memcmp(dst + len - 3, "IES", 3) == 0)
                {
                    dst[len - 3] = 'Y';
                    outLen = len - 2;
                }
                else
                {
                    outLen = len - 1;
                }
            }
        }
        dst += outLen;
    }
    /* Terminate only once at the end: dst may still equal src while
       scanning, and an early NUL could overwrite unread input. */
    *dst = '\0';
    return str;
}
|
427
|
+
|
428
|
+
|
363
429
|
// !!!! This ONLY works where rep is longer than with
|
364
430
|
char *str_replace(char *orig, const char *rep, const char *with)
|
365
431
|
{
|
Binary file
|
@@ -24,7 +24,7 @@ class TestConsistentCompany < Test::Unit::TestCase
|
|
24
24
|
# empty name
|
25
25
|
assert_equal("", "".company_namer)
|
26
26
|
# a very long name
|
27
|
-
assert_equal("A"*1000, ("A"*1000).company_namer)
|
27
|
+
assert_equal("A"*1000+"NAMEISHERE", (" A"*1000 + 'NAME IS HERE ').company_namer)
|
28
28
|
# parenthesis matching
|
29
29
|
assert_equal("BBEE", ("BB(xx)EE").company_namer)
|
30
30
|
assert_equal("BE", ("B(xx)E").company_namer)
|
@@ -54,6 +54,10 @@ class TestConsistentCompany < Test::Unit::TestCase
|
|
54
54
|
# common name shortening
|
55
55
|
assert_equal("TESTCTRCTRCTR", ("Test Center Center Center").company_namer)
|
56
56
|
|
57
|
+
# #singularize
|
58
|
+
# assert_equal("TESTNAMEHERE", (" Test Names here").company_namer)
|
59
|
+
# assert_equal("TESTBATTERY", ("Test Batteries").company_namer)
|
60
|
+
|
57
61
|
assert_equal("My Test Advertising Co".company_namer, "MY TEST ADV COMPANY".company_namer)
|
58
62
|
end
|
59
63
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: consistent_company
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2011-09-
|
12
|
+
date: 2011-09-30 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: shoulda
|
16
|
-
requirement: &
|
16
|
+
requirement: &70187182156920 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70187182156920
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: bundler
|
27
|
-
requirement: &
|
27
|
+
requirement: &70187182156220 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 1.0.18
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70187182156220
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: jeweler
|
38
|
-
requirement: &
|
38
|
+
requirement: &70187182155600 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: 1.6.4
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70187182155600
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: rcov
|
49
|
-
requirement: &
|
49
|
+
requirement: &70187182154960 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :development
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70187182154960
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: pry
|
60
|
-
requirement: &
|
60
|
+
requirement: &70187182154360 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :development
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70187182154360
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: rake-compiler
|
71
|
-
requirement: &
|
71
|
+
requirement: &70187182153680 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ~>
|
@@ -76,7 +76,7 @@ dependencies:
|
|
76
76
|
version: 0.7.6
|
77
77
|
type: :development
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *70187182153680
|
80
80
|
description: Ruby C Extension to normalize a company name. Useful when company names
|
81
81
|
come from various sources.
|
82
82
|
email: dcleven@marketron.com
|
@@ -117,7 +117,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
117
117
|
version: '0'
|
118
118
|
segments:
|
119
119
|
- 0
|
120
|
-
hash:
|
120
|
+
hash: 3110059090780389810
|
121
121
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
122
122
|
none: false
|
123
123
|
requirements:
|