consistent_company 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -6,6 +6,8 @@ This was written with the Adverting domain in mind. So many of the common advert
6
6
  handled well. Although it can work with other domains, you may find you need to enhance the specific company name
7
7
  identifiers in IsCompanyName and TransformCompany.
8
8
 
9
+ Tested with over 600,000 names from hundreds of sources, where an overlap of names is expected, with a match rate of 40%.
10
+
9
11
  ## Install:
10
12
  gem install consistent_company
11
13
 
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{consistent_company}
8
- s.version = "0.0.3"
8
+ s.version = "0.0.4"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = [%q{Doug Cleven}]
12
- s.date = %q{2011-09-20}
12
+ s.date = %q{2011-09-30}
13
13
  s.description = %q{Ruby C Extension to normalize a company name. Useful when company names come from various sources.}
14
14
  s.email = %q{dcleven@marketron.com}
15
15
  s.extensions = [%q{ext/consistent_company/extconf.rb}]
@@ -7,6 +7,7 @@
7
7
  char * TransformCompany(char * inString);
8
8
  static int IsCompanyWord(char * inWord);
9
9
  char *trimwhitespace(char *str);
10
+ char *trimsuffix(char *str, const char *suffix);
10
11
  char *str_replace(char *orig, const char *rep, const char *with);
11
12
 
12
13
  static VALUE rb_ConsistentCompany_Init(VALUE self)
@@ -18,6 +19,20 @@ static VALUE rb_ConsistentCompany_Init(VALUE self)
18
19
  static VALUE rb_CompanyNamer(VALUE self)
19
20
  {
20
21
  char * pSelf = RSTRING_PTR(self);
22
+ int selfLen = strlen(pSelf)+2;
23
+ int workLen = selfLen;
24
+ char * s = pSelf;
25
+
26
+ // calc size of work strings
27
+ // while processing we turn & = AND, + = PLUS
28
+ // and we add space at front and back
29
+ while (s = strpbrk(s, "&+"))
30
+ {
31
+ workLen +=3; // worst case we add 3 chars
32
+ s++;
33
+ }
34
+ workLen += 2; // add space front and back
35
+ //////////////
21
36
 
22
37
  // for company only
23
38
  int i;
@@ -25,8 +40,8 @@ static VALUE rb_CompanyNamer(VALUE self)
25
40
  int asc;
26
41
  int numLefts = 0, numRights = 0;
27
42
  int left1 = -1, right1 = -1, left2 = -1, right2 = -1;
28
- char * workString = malloc(strlen(pSelf)+2); // 2 extra chars for TransformCompany
29
- char * returnString = malloc(strlen(pSelf)+2);
43
+ char * workString = malloc(workLen); // 2 extra chars for TransformCompany
44
+ char * returnString = malloc(workLen);
30
45
  char * inString;
31
46
  strcpy(workString, pSelf);
32
47
  inString = workString;
@@ -146,8 +161,17 @@ static VALUE rb_CompanyNamer(VALUE self)
146
161
  strcat(returnString, " ");
147
162
  }
148
163
  }
164
+ // if (strlen(workString) > workLen || strlen(returnString) > workLen)
165
+ // {
166
+ // char buff[200];
167
+ // sprintf(buff, "workLen %d %s workString %d returnString %d %s", workLen, pSelf, strlen(workString), strlen(returnString), returnString);
168
+ // return rb_str_new2(trimwhitespace(buff));
169
+ // }
149
170
  char * p;
150
171
  str_replace(returnString, " AND ", " & ");
172
+
173
+ int oldLen = strlen(returnString);
174
+ // returnString = trimsuffix(returnString, "s");
151
175
  returnString = trimwhitespace(returnString);
152
176
  strcpy(returnString, TransformCompany(returnString));
153
177
  VALUE return_value = rb_str_new2(trimwhitespace(returnString));
@@ -166,7 +190,7 @@ FIRST FEDERAL SAVINGS becomes 1ST FEDERAL SAVINGS
166
190
  char * TransformCompany(char * resultString)
167
191
  {
168
192
  // resultString should have been allocated with 2 extra char for our padding here
169
- char * buf = malloc(strlen(resultString));
193
+ char * buf = malloc(strlen(resultString)+3);
170
194
  strcpy(buf, " ");
171
195
  strcat(buf,resultString);
172
196
  strcat(buf, " ");
@@ -203,6 +227,10 @@ char * TransformCompany(char * resultString)
203
227
  str_replace(s, " TENTH ", " 10TH ");
204
228
  str_replace(s, " CENTRE ", " CTR ");
205
229
  str_replace(s, " CENTER ", " CTR ");
230
+ str_replace(s, " CNTR ", " CTR ");
231
+ str_replace(s, " CTR ", " CTR ");
232
+ str_replace(s, " CENT ", " CTR ");
233
+ str_replace(s, " CENTR ", " CTR ");
206
234
  str_replace(s, " AUTOMOTIVE ", " AUTO ");
207
235
  str_replace(s, " AUTOMOBILE ", " AUTO ");
208
236
  str_replace(s, " AUTOS ", " AUTO ");
@@ -247,7 +275,7 @@ char * TransformCompany(char * resultString)
247
275
  //spaceLoc = resultString.LastIndexOf(" ");
248
276
  if (spaceLoc) // Look at the last word
249
277
  {
250
- char * lastWord = malloc(strlen(spaceLoc));
278
+ char * lastWord = malloc(strlen(spaceLoc)+1);
251
279
  strcpy(lastWord, spaceLoc + 1);
252
280
  if (IsCompanyWord(lastWord))
253
281
  {
@@ -360,6 +388,44 @@ char *trimwhitespace(char *str)
360
388
  return str;
361
389
  }
362
390
 
391
+ char *trimsuffix(char *str, const char *suffix)
392
+ {
393
+ char delims[] = " ";
394
+ char *result = NULL;
395
+ char *workString = malloc(strlen(str)+3);
396
+ char *workBuffer = malloc(strlen(str)+3);
397
+ strcpy(workString, str);
398
+ str[0] = '\0';
399
+ result = strtok(workString, delims);
400
+ while(result != NULL)
401
+ {
402
+ strcpy(workBuffer, result);
403
+ int len = strlen(workBuffer);
404
+ if (len > 3)
405
+ {
406
+ if (workBuffer[len-1] == 'S')
407
+ {
408
+ char * p = strstr(workBuffer, "IES");
409
+ if (p && p[3] == '\0' && strcmp(workBuffer, "SERIES") != 0)
410
+ {
411
+ *p = 'Y';
412
+ *++p = '\0';
413
+ }
414
+ if (strcmp(workBuffer, "PLUS") != 0)
415
+ workBuffer[len-1] = '\0';
416
+ }
417
+ }
418
+ strcat(str, workBuffer);
419
+ result = strtok( NULL, delims );
420
+ if (result)
421
+ strcat(str, " ");
422
+ }
423
+ free(workString);
424
+ free(workBuffer);
425
+ return str;
426
+ }
427
+
428
+
363
429
  // !!!! This ONLY works where rep is longer than with
364
430
  char *str_replace(char *orig, const char *rep, const char *with)
365
431
  {
@@ -2,7 +2,7 @@ module ConsistentCompany
2
2
  module Version
3
3
  MAJOR = 0
4
4
  MINOR = 0
5
- PATCH = 3
5
+ PATCH = 4
6
6
  BUILD = nil
7
7
 
8
8
  STRING = [MAJOR, MINOR, PATCH, BUILD].compact.join('.')
@@ -24,7 +24,7 @@ class TestConsistentCompany < Test::Unit::TestCase
24
24
  # empty name
25
25
  assert_equal("", "".company_namer)
26
26
  # a very long name
27
- assert_equal("A"*1000, ("A"*1000).company_namer)
27
+ assert_equal("A"*1000+"NAMEISHERE", (" A"*1000 + 'NAME IS HERE ').company_namer)
28
28
  # parenthesis matching
29
29
  assert_equal("BBEE", ("BB(xx)EE").company_namer)
30
30
  assert_equal("BE", ("B(xx)E").company_namer)
@@ -54,6 +54,10 @@ class TestConsistentCompany < Test::Unit::TestCase
54
54
  # common name shortening
55
55
  assert_equal("TESTCTRCTRCTR", ("Test Center Center Center").company_namer)
56
56
 
57
+ # #singularize
58
+ # assert_equal("TESTNAMEHERE", (" Test Names here").company_namer)
59
+ # assert_equal("TESTBATTERY", ("Test Batteries").company_namer)
60
+
57
61
  assert_equal("My Test Advertising Co".company_namer, "MY TEST ADV COMPANY".company_namer)
58
62
  end
59
63
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: consistent_company
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2011-09-20 00:00:00.000000000Z
12
+ date: 2011-09-30 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: shoulda
16
- requirement: &70285303244580 !ruby/object:Gem::Requirement
16
+ requirement: &70187182156920 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *70285303244580
24
+ version_requirements: *70187182156920
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: bundler
27
- requirement: &70285303243640 !ruby/object:Gem::Requirement
27
+ requirement: &70187182156220 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: 1.0.18
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *70285303243640
35
+ version_requirements: *70187182156220
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: jeweler
38
- requirement: &70285303242660 !ruby/object:Gem::Requirement
38
+ requirement: &70187182155600 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: 1.6.4
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *70285303242660
46
+ version_requirements: *70187182155600
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: rcov
49
- requirement: &70285303241720 !ruby/object:Gem::Requirement
49
+ requirement: &70187182154960 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :development
56
56
  prerelease: false
57
- version_requirements: *70285303241720
57
+ version_requirements: *70187182154960
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: pry
60
- requirement: &70285303241040 !ruby/object:Gem::Requirement
60
+ requirement: &70187182154360 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :development
67
67
  prerelease: false
68
- version_requirements: *70285303241040
68
+ version_requirements: *70187182154360
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: rake-compiler
71
- requirement: &70285303240300 !ruby/object:Gem::Requirement
71
+ requirement: &70187182153680 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ~>
@@ -76,7 +76,7 @@ dependencies:
76
76
  version: 0.7.6
77
77
  type: :development
78
78
  prerelease: false
79
- version_requirements: *70285303240300
79
+ version_requirements: *70187182153680
80
80
  description: Ruby C Extension to normalize a company name. Useful when company names
81
81
  come from various sources.
82
82
  email: dcleven@marketron.com
@@ -117,7 +117,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
117
117
  version: '0'
118
118
  segments:
119
119
  - 0
120
- hash: -1402416125086284082
120
+ hash: 3110059090780389810
121
121
  required_rubygems_version: !ruby/object:Gem::Requirement
122
122
  none: false
123
123
  requirements: