consistent_company 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -6,6 +6,8 @@ This was written with the Advertising domain in mind. So many of the common advert
6
6
  handled well. Although it can work with other domains, you may find you need to enhance the specific company name
7
7
  identifiers in IsCompanyName and TransformCompany.
8
8
 
9
+ Tested with over 600,000 names from hundreds of sources where an overlap of names is expected, with a match rate of 40%.
10
+
9
11
  ## Install:
10
12
  gem install consistent_company
11
13
 
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{consistent_company}
8
- s.version = "0.0.3"
8
+ s.version = "0.0.4"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = [%q{Doug Cleven}]
12
- s.date = %q{2011-09-20}
12
+ s.date = %q{2011-09-30}
13
13
  s.description = %q{Ruby C Extension to normalize a company name. Useful when company names come from various sources.}
14
14
  s.email = %q{dcleven@marketron.com}
15
15
  s.extensions = [%q{ext/consistent_company/extconf.rb}]
@@ -7,6 +7,7 @@
7
7
  char * TransformCompany(char * inString);
8
8
  static int IsCompanyWord(char * inWord);
9
9
  char *trimwhitespace(char *str);
10
+ char *trimsuffix(char *str, const char *suffix);
10
11
  char *str_replace(char *orig, const char *rep, const char *with);
11
12
 
12
13
  static VALUE rb_ConsistentCompany_Init(VALUE self)
@@ -18,6 +19,20 @@ static VALUE rb_ConsistentCompany_Init(VALUE self)
18
19
  static VALUE rb_CompanyNamer(VALUE self)
19
20
  {
20
21
  char * pSelf = RSTRING_PTR(self);
22
+ int selfLen = strlen(pSelf)+2;
23
+ int workLen = selfLen;
24
+ char * s = pSelf;
25
+
26
+ // calc size of work strings
27
+ // while processing we turn & = AND, + = PLUS
28
+ // and we add space at front and back
29
+ while (s = strpbrk(s, "&+"))
30
+ {
31
+ workLen +=3; // worst case we add 3 chars
32
+ s++;
33
+ }
34
+ workLen += 2; // add space front and back
35
+ //////////////
21
36
 
22
37
  // for company only
23
38
  int i;
@@ -25,8 +40,8 @@ static VALUE rb_CompanyNamer(VALUE self)
25
40
  int asc;
26
41
  int numLefts = 0, numRights = 0;
27
42
  int left1 = -1, right1 = -1, left2 = -1, right2 = -1;
28
- char * workString = malloc(strlen(pSelf)+2); // 2 extra chars for TransformCompany
29
- char * returnString = malloc(strlen(pSelf)+2);
43
+ char * workString = malloc(workLen); // 2 extra chars for TransformCompany
44
+ char * returnString = malloc(workLen);
30
45
  char * inString;
31
46
  strcpy(workString, pSelf);
32
47
  inString = workString;
@@ -146,8 +161,17 @@ static VALUE rb_CompanyNamer(VALUE self)
146
161
  strcat(returnString, " ");
147
162
  }
148
163
  }
164
+ // if (strlen(workString) > workLen || strlen(returnString) > workLen)
165
+ // {
166
+ // char buff[200];
167
+ // sprintf(buff, "workLen %d %s workString %d returnString %d %s", workLen, pSelf, strlen(workString), strlen(returnString), returnString);
168
+ // return rb_str_new2(trimwhitespace(buff));
169
+ // }
149
170
  char * p;
150
171
  str_replace(returnString, " AND ", " & ");
172
+
173
+ int oldLen = strlen(returnString);
174
+ // returnString = trimsuffix(returnString, "s");
151
175
  returnString = trimwhitespace(returnString);
152
176
  strcpy(returnString, TransformCompany(returnString));
153
177
  VALUE return_value = rb_str_new2(trimwhitespace(returnString));
@@ -166,7 +190,7 @@ FIRST FEDERAL SAVINGS becomes 1ST FEDERAL SAVINGS
166
190
  char * TransformCompany(char * resultString)
167
191
  {
168
192
  // resultString should have been allocated with 2 extra char for our padding here
169
- char * buf = malloc(strlen(resultString));
193
+ char * buf = malloc(strlen(resultString)+3);
170
194
  strcpy(buf, " ");
171
195
  strcat(buf,resultString);
172
196
  strcat(buf, " ");
@@ -203,6 +227,10 @@ char * TransformCompany(char * resultString)
203
227
  str_replace(s, " TENTH ", " 10TH ");
204
228
  str_replace(s, " CENTRE ", " CTR ");
205
229
  str_replace(s, " CENTER ", " CTR ");
230
+ str_replace(s, " CNTR ", " CTR ");
231
+ str_replace(s, " CTR ", " CTR ");
232
+ str_replace(s, " CENT ", " CTR ");
233
+ str_replace(s, " CENTR ", " CTR ");
206
234
  str_replace(s, " AUTOMOTIVE ", " AUTO ");
207
235
  str_replace(s, " AUTOMOBILE ", " AUTO ");
208
236
  str_replace(s, " AUTOS ", " AUTO ");
@@ -247,7 +275,7 @@ char * TransformCompany(char * resultString)
247
275
  //spaceLoc = resultString.LastIndexOf(" ");
248
276
  if (spaceLoc) // Look at the last word
249
277
  {
250
- char * lastWord = malloc(strlen(spaceLoc));
278
+ char * lastWord = malloc(strlen(spaceLoc)+1);
251
279
  strcpy(lastWord, spaceLoc + 1);
252
280
  if (IsCompanyWord(lastWord))
253
281
  {
@@ -360,6 +388,44 @@ char *trimwhitespace(char *str)
360
388
  return str;
361
389
  }
362
390
 
391
+ char *trimsuffix(char *str, const char *suffix)
392
+ {
393
+ char delims[] = " ";
394
+ char *result = NULL;
395
+ char *workString = malloc(strlen(str)+3);
396
+ char *workBuffer = malloc(strlen(str)+3);
397
+ strcpy(workString, str);
398
+ str[0] = '\0';
399
+ result = strtok(workString, delims);
400
+ while(result != NULL)
401
+ {
402
+ strcpy(workBuffer, result);
403
+ int len = strlen(workBuffer);
404
+ if (len > 3)
405
+ {
406
+ if (workBuffer[len-1] == 'S')
407
+ {
408
+ char * p = strstr(workBuffer, "IES");
409
+ if (p && p[3] == '\0' && strcmp(workBuffer, "SERIES") != 0)
410
+ {
411
+ *p = 'Y';
412
+ *++p = '\0';
413
+ }
414
+ if (strcmp(workBuffer, "PLUS") != 0)
415
+ workBuffer[len-1] = '\0';
416
+ }
417
+ }
418
+ strcat(str, workBuffer);
419
+ result = strtok( NULL, delims );
420
+ if (result)
421
+ strcat(str, " ");
422
+ }
423
+ free(workString);
424
+ free(workBuffer);
425
+ return str;
426
+ }
427
+
428
+
363
429
  // !!!! This ONLY works where rep is longer than with
364
430
  char *str_replace(char *orig, const char *rep, const char *with)
365
431
  {
@@ -2,7 +2,7 @@ module ConsistentCompany
2
2
  module Version
3
3
  MAJOR = 0
4
4
  MINOR = 0
5
- PATCH = 3
5
+ PATCH = 4
6
6
  BUILD = nil
7
7
 
8
8
  STRING = [MAJOR, MINOR, PATCH, BUILD].compact.join('.')
@@ -24,7 +24,7 @@ class TestConsistentCompany < Test::Unit::TestCase
24
24
  # empty name
25
25
  assert_equal("", "".company_namer)
26
26
  # a very long name
27
- assert_equal("A"*1000, ("A"*1000).company_namer)
27
+ assert_equal("A"*1000+"NAMEISHERE", (" A"*1000 + 'NAME IS HERE ').company_namer)
28
28
  # parenthesis matching
29
29
  assert_equal("BBEE", ("BB(xx)EE").company_namer)
30
30
  assert_equal("BE", ("B(xx)E").company_namer)
@@ -54,6 +54,10 @@ class TestConsistentCompany < Test::Unit::TestCase
54
54
  # common name shortening
55
55
  assert_equal("TESTCTRCTRCTR", ("Test Center Center Center").company_namer)
56
56
 
57
+ # #singularize
58
+ # assert_equal("TESTNAMEHERE", (" Test Names here").company_namer)
59
+ # assert_equal("TESTBATTERY", ("Test Batteries").company_namer)
60
+
57
61
  assert_equal("My Test Advertising Co".company_namer, "MY TEST ADV COMPANY".company_namer)
58
62
  end
59
63
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: consistent_company
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2011-09-20 00:00:00.000000000Z
12
+ date: 2011-09-30 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: shoulda
16
- requirement: &70285303244580 !ruby/object:Gem::Requirement
16
+ requirement: &70187182156920 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *70285303244580
24
+ version_requirements: *70187182156920
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: bundler
27
- requirement: &70285303243640 !ruby/object:Gem::Requirement
27
+ requirement: &70187182156220 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: 1.0.18
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *70285303243640
35
+ version_requirements: *70187182156220
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: jeweler
38
- requirement: &70285303242660 !ruby/object:Gem::Requirement
38
+ requirement: &70187182155600 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: 1.6.4
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *70285303242660
46
+ version_requirements: *70187182155600
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: rcov
49
- requirement: &70285303241720 !ruby/object:Gem::Requirement
49
+ requirement: &70187182154960 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :development
56
56
  prerelease: false
57
- version_requirements: *70285303241720
57
+ version_requirements: *70187182154960
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: pry
60
- requirement: &70285303241040 !ruby/object:Gem::Requirement
60
+ requirement: &70187182154360 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :development
67
67
  prerelease: false
68
- version_requirements: *70285303241040
68
+ version_requirements: *70187182154360
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: rake-compiler
71
- requirement: &70285303240300 !ruby/object:Gem::Requirement
71
+ requirement: &70187182153680 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ~>
@@ -76,7 +76,7 @@ dependencies:
76
76
  version: 0.7.6
77
77
  type: :development
78
78
  prerelease: false
79
- version_requirements: *70285303240300
79
+ version_requirements: *70187182153680
80
80
  description: Ruby C Extension to normalize a company name. Useful when company names
81
81
  come from various sources.
82
82
  email: dcleven@marketron.com
@@ -117,7 +117,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
117
117
  version: '0'
118
118
  segments:
119
119
  - 0
120
- hash: -1402416125086284082
120
+ hash: 3110059090780389810
121
121
  required_rubygems_version: !ruby/object:Gem::Requirement
122
122
  none: false
123
123
  requirements: