geo_coder 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +12 -0
- data/Gemfile.lock +32 -0
- data/History.txt +6 -0
- data/Makefile +13 -0
- data/Manifest.txt +18 -0
- data/README.rdoc +197 -0
- data/Rakefile +53 -0
- data/TODO.txt +8 -0
- data/VERSION +1 -0
- data/bin/build_indexes +8 -0
- data/bin/rebuild_cluster +22 -0
- data/bin/rebuild_metaphones +23 -0
- data/bin/tiger_import +59 -0
- data/demos/demo/app/ext/geocodewrap.rb +84 -0
- data/demos/demo/app/views/index.builder +13 -0
- data/demos/demo/app/views/index.erb +71 -0
- data/demos/demo/config.ru +12 -0
- data/demos/demo/config/bootstraps.rb +130 -0
- data/demos/demo/config/geoenvironment.rb +25 -0
- data/demos/demo/geocoder_helper.rb +12 -0
- data/demos/demo/geocom_geocode.rb +10 -0
- data/demos/demo/main.rb +3 -0
- data/demos/demo/rakefile.rb +17 -0
- data/demos/demo/tmp/restart.txt +0 -0
- data/demos/simpledemo/views/index.builder +13 -0
- data/demos/simpledemo/views/index.erb +69 -0
- data/demos/simpledemo/ws.rb +83 -0
- data/doc/Makefile +7 -0
- data/doc/html4css1.css +279 -0
- data/doc/lookup.rst +193 -0
- data/doc/parsing.rst +125 -0
- data/doc/voidspace.css +147 -0
- data/geo_coder.gemspec +172 -0
- data/lib/geocoder/us.rb +21 -0
- data/lib/geocoder/us/address.rb +290 -0
- data/lib/geocoder/us/constants.rb +670 -0
- data/lib/geocoder/us/database.rb +745 -0
- data/lib/geocoder/us/import.rb +181 -0
- data/lib/geocoder/us/import/tiger.rb +13 -0
- data/lib/geocoder/us/numbers.rb +58 -0
- data/navteq/README +4 -0
- data/navteq/convert.sql +37 -0
- data/navteq/navteq_import +39 -0
- data/navteq/prepare.sql +92 -0
- data/sql/cluster.sql +16 -0
- data/sql/convert.sql +80 -0
- data/sql/create.sql +37 -0
- data/sql/index.sql +12 -0
- data/sql/place.csv +104944 -0
- data/sql/place.sql +104948 -0
- data/sql/setup.sql +78 -0
- data/src/Makefile +13 -0
- data/src/README +14 -0
- data/src/liblwgeom/Makefile +75 -0
- data/src/liblwgeom/box2d.c +54 -0
- data/src/liblwgeom/lex.yy.c +4799 -0
- data/src/liblwgeom/liblwgeom.h +1405 -0
- data/src/liblwgeom/lwalgorithm.c +946 -0
- data/src/liblwgeom/lwalgorithm.h +52 -0
- data/src/liblwgeom/lwcircstring.c +759 -0
- data/src/liblwgeom/lwcollection.c +541 -0
- data/src/liblwgeom/lwcompound.c +118 -0
- data/src/liblwgeom/lwcurvepoly.c +86 -0
- data/src/liblwgeom/lwgeom.c +886 -0
- data/src/liblwgeom/lwgeom_api.c +2201 -0
- data/src/liblwgeom/lwgparse.c +1219 -0
- data/src/liblwgeom/lwgunparse.c +1054 -0
- data/src/liblwgeom/lwline.c +525 -0
- data/src/liblwgeom/lwmcurve.c +125 -0
- data/src/liblwgeom/lwmline.c +137 -0
- data/src/liblwgeom/lwmpoint.c +138 -0
- data/src/liblwgeom/lwmpoly.c +141 -0
- data/src/liblwgeom/lwmsurface.c +129 -0
- data/src/liblwgeom/lwpoint.c +439 -0
- data/src/liblwgeom/lwpoly.c +579 -0
- data/src/liblwgeom/lwsegmentize.c +1047 -0
- data/src/liblwgeom/lwutil.c +369 -0
- data/src/liblwgeom/measures.c +861 -0
- data/src/liblwgeom/postgis_config.h +93 -0
- data/src/liblwgeom/ptarray.c +847 -0
- data/src/liblwgeom/vsprintf.c +179 -0
- data/src/liblwgeom/wktparse.h +126 -0
- data/src/liblwgeom/wktparse.lex +74 -0
- data/src/liblwgeom/wktparse.tab.c +2353 -0
- data/src/liblwgeom/wktparse.tab.h +145 -0
- data/src/liblwgeom/wktparse.y +385 -0
- data/src/libsqlite3_geocoder/Makefile +22 -0
- data/src/libsqlite3_geocoder/Makefile.nix +15 -0
- data/src/libsqlite3_geocoder/Makefile.redhat +15 -0
- data/src/libsqlite3_geocoder/extension.c +121 -0
- data/src/libsqlite3_geocoder/extension.h +13 -0
- data/src/libsqlite3_geocoder/levenshtein.c +42 -0
- data/src/libsqlite3_geocoder/metaphon.c +278 -0
- data/src/libsqlite3_geocoder/util.c +37 -0
- data/src/libsqlite3_geocoder/wkb_compress.c +54 -0
- data/src/metaphone/Makefile +7 -0
- data/src/metaphone/README +49 -0
- data/src/metaphone/extension.c +37 -0
- data/src/metaphone/metaphon.c +251 -0
- data/src/shp2sqlite/Makefile +37 -0
- data/src/shp2sqlite/Makefile.nix +36 -0
- data/src/shp2sqlite/Makefile.redhat +35 -0
- data/src/shp2sqlite/dbfopen.c +1595 -0
- data/src/shp2sqlite/getopt.c +695 -0
- data/src/shp2sqlite/getopt.h +127 -0
- data/src/shp2sqlite/shapefil.h +500 -0
- data/src/shp2sqlite/shp2sqlite.c +1974 -0
- data/src/shp2sqlite/shpopen.c +1894 -0
- data/tests/address.rb +236 -0
- data/tests/benchmark.rb +20 -0
- data/tests/constants.rb +57 -0
- data/tests/data/address-sample.csv +52 -0
- data/tests/data/db-test.csv +57 -0
- data/tests/data/locations.csv +4 -0
- data/tests/database.rb +137 -0
- data/tests/generate.rb +34 -0
- data/tests/numbers.rb +46 -0
- data/tests/run.rb +11 -0
- metadata +237 -0
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# Builds the SQLite geocoder extension and its small stand-alone test drivers.
#
# Each rule is defined exactly once.  The earlier copy of this file repeated
# every rule; the stale "test: wkb_compress.c" recipe was merged with the newer
# "test:" prerequisites and ended up compiling non-existent files.

CC=gcc -fPIC

# None of these targets name a file that the recipe produces.
.PHONY: all test test_wkb_compress test_levenshtein clean

all: libsqlite3_geocoder.so

# Libraries go after the objects so that linkers which resolve symbols
# left-to-right (or use --as-needed) still pick them up.
libsqlite3_geocoder.so: extension.o wkb_compress.o util.o metaphon.o levenshtein.o
	$(CC) -I/usr/include -shared $^ -o $@ -lsqlite3 -lm

test: test_wkb_compress test_levenshtein

test_wkb_compress: wkb_compress.c
	$(CC) -DTEST -o wkb_compress $^

test_levenshtein: levenshtein.c
	$(CC) -DTEST -o levenshtein $^

clean:
	rm -f *.o *.so wkb_compress levenshtein
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# Generic *nix build of the SQLite geocoder extension.
# CC and CFLAGS come from make's defaults or the environment.

# None of these targets name a file that the recipe produces.
.PHONY: all test test_wkb_compress test_levenshtein clean

all: libsqlite3_geocoder.so

# extension.o calls ceil(), so link the math library explicitly.
libsqlite3_geocoder.so: extension.o wkb_compress.o util.o metaphon.o levenshtein.o
	$(CC) -shared $^ -o $@ -lm

test: test_wkb_compress test_levenshtein

test_wkb_compress: wkb_compress.c
	$(CC) -DTEST -o wkb_compress $^

test_levenshtein: levenshtein.c
	$(CC) -DTEST -o levenshtein $^

clean:
	rm -f *.o *.so wkb_compress levenshtein
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# Red Hat build of the SQLite geocoder extension (position-independent objects).

CFLAGS=-fPIC

# None of these targets name a file that the recipe produces.
.PHONY: all test test_wkb_compress test_levenshtein clean

all: libsqlite3_geocoder.so

# extension.o calls ceil(), so link the math library explicitly.
libsqlite3_geocoder.so: extension.o wkb_compress.o util.o metaphon.o levenshtein.o
	$(CC) $(CFLAGS) -shared $^ -o $@ -lm

test: test_wkb_compress test_levenshtein

test_wkb_compress: wkb_compress.c
	$(CC) -DTEST -o wkb_compress $^

test_levenshtein: levenshtein.c
	$(CC) -DTEST -o levenshtein $^

clean:
	rm -f *.o *.so wkb_compress levenshtein
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
# include <sqlite3ext.h>
|
|
2
|
+
# include <stdio.h>
|
|
3
|
+
# include <string.h>
|
|
4
|
+
# include <assert.h>
|
|
5
|
+
# include <math.h>
|
|
6
|
+
|
|
7
|
+
# include "extension.h"
|
|
8
|
+
|
|
9
|
+
static SQLITE_EXTENSION_INIT1;
|
|
10
|
+
|
|
11
|
+
static void
|
|
12
|
+
sqlite3_metaphone (sqlite3_context *context, int argc, sqlite3_value **argv) {
|
|
13
|
+
const unsigned char *input = sqlite3_value_text(argv[0]);
|
|
14
|
+
int max_phones = 0;
|
|
15
|
+
char *output;
|
|
16
|
+
int len;
|
|
17
|
+
if (sqlite3_value_type(argv[0]) == SQLITE_NULL) {
|
|
18
|
+
sqlite3_result_null(context);
|
|
19
|
+
return;
|
|
20
|
+
}
|
|
21
|
+
if (argc > 1)
|
|
22
|
+
max_phones = sqlite3_value_int(argv[1]);
|
|
23
|
+
if (max_phones <= 0)
|
|
24
|
+
max_phones = strlen(input);
|
|
25
|
+
output = sqlite3_malloc((max_phones+1)*sizeof(char));
|
|
26
|
+
len = metaphone(input, output, max_phones);
|
|
27
|
+
sqlite3_result_text(context, output, len, sqlite3_free);
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
static void
|
|
31
|
+
sqlite3_levenshtein (sqlite3_context *context, int argc, sqlite3_value **argv) {
|
|
32
|
+
const unsigned char *s1 = sqlite3_value_text(argv[0]),
|
|
33
|
+
*s2 = sqlite3_value_text(argv[1]);
|
|
34
|
+
double dist;
|
|
35
|
+
if (sqlite3_value_type(argv[0]) == SQLITE_NULL ||
|
|
36
|
+
sqlite3_value_type(argv[1]) == SQLITE_NULL) {
|
|
37
|
+
sqlite3_result_null(context);
|
|
38
|
+
return;
|
|
39
|
+
}
|
|
40
|
+
dist = levenshtein_distance(s1, s2);
|
|
41
|
+
sqlite3_result_double(context, dist);
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
static void
|
|
45
|
+
sqlite3_digit_suffix (sqlite3_context *context,
|
|
46
|
+
int argc, sqlite3_value **argv) {
|
|
47
|
+
if (sqlite3_value_type(argv[0]) == SQLITE_NULL) {
|
|
48
|
+
sqlite3_result_null(context);
|
|
49
|
+
return;
|
|
50
|
+
}
|
|
51
|
+
const unsigned char *input = sqlite3_value_text(argv[0]);
|
|
52
|
+
char *output = sqlite3_malloc((strlen(input)+1) * sizeof(char));
|
|
53
|
+
size_t len = digit_suffix(input, output);
|
|
54
|
+
sqlite3_result_text(context, output, len, sqlite3_free);
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
static void
|
|
58
|
+
sqlite3_nondigit_prefix (sqlite3_context *context,
|
|
59
|
+
int argc, sqlite3_value **argv) {
|
|
60
|
+
if (sqlite3_value_type(argv[0]) == SQLITE_NULL) {
|
|
61
|
+
sqlite3_result_null(context);
|
|
62
|
+
return;
|
|
63
|
+
}
|
|
64
|
+
const unsigned char *input = sqlite3_value_text(argv[0]);
|
|
65
|
+
char *output = sqlite3_malloc((strlen(input)+1) * sizeof(char));
|
|
66
|
+
size_t len = nondigit_prefix(input, output);
|
|
67
|
+
sqlite3_result_text(context, output, len, sqlite3_free);
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
static void
|
|
72
|
+
sqlite3_compress_wkb_line (sqlite3_context *context,
|
|
73
|
+
int argc, sqlite3_value **argv) {
|
|
74
|
+
if (sqlite3_value_type(argv[0]) == SQLITE_NULL) {
|
|
75
|
+
sqlite3_result_null(context);
|
|
76
|
+
return;
|
|
77
|
+
}
|
|
78
|
+
unsigned long input_len = sqlite3_value_bytes(argv[0]);
|
|
79
|
+
const void *input = sqlite3_value_blob(argv[0]);
|
|
80
|
+
unsigned long output_len = ceil((input_len-9)/8.0) * 4;
|
|
81
|
+
unsigned long len = 0;
|
|
82
|
+
void *output = sqlite3_malloc(output_len);
|
|
83
|
+
len = compress_wkb_line(output, input, input_len);
|
|
84
|
+
assert(len == output_len);
|
|
85
|
+
sqlite3_result_blob(context, output, len, sqlite3_free);
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
static void
|
|
89
|
+
sqlite3_uncompress_wkb_line (sqlite3_context *context,
|
|
90
|
+
int argc, sqlite3_value **argv) {
|
|
91
|
+
unsigned long input_len = sqlite3_value_bytes(argv[0]);
|
|
92
|
+
const void *input = sqlite3_value_blob(argv[0]);
|
|
93
|
+
unsigned long output_len = input_len*2+9;
|
|
94
|
+
unsigned long len = 0;
|
|
95
|
+
void *output = sqlite3_malloc(output_len);
|
|
96
|
+
len = uncompress_wkb_line(output, input, input_len);
|
|
97
|
+
assert(len == output_len);
|
|
98
|
+
sqlite3_result_blob(context, output, len, sqlite3_free);
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
int sqlite3_extension_init (sqlite3 * db, char **pzErrMsg,
|
|
102
|
+
const sqlite3_api_routines *pApi) {
|
|
103
|
+
SQLITE_EXTENSION_INIT2(pApi);
|
|
104
|
+
|
|
105
|
+
sqlite3_create_function(db, "metaphone", 1, SQLITE_ANY,
|
|
106
|
+
NULL, sqlite3_metaphone, NULL, NULL);
|
|
107
|
+
sqlite3_create_function(db, "metaphone", 2, SQLITE_ANY,
|
|
108
|
+
NULL, sqlite3_metaphone, NULL, NULL);
|
|
109
|
+
|
|
110
|
+
sqlite3_create_function(db, "levenshtein", 2, SQLITE_ANY,
|
|
111
|
+
NULL, sqlite3_levenshtein, NULL, NULL);
|
|
112
|
+
sqlite3_create_function(db, "compress_wkb_line", 1, SQLITE_ANY,
|
|
113
|
+
NULL, sqlite3_compress_wkb_line, NULL, NULL);
|
|
114
|
+
sqlite3_create_function(db, "uncompress_wkb_line", 1, SQLITE_ANY,
|
|
115
|
+
NULL, sqlite3_uncompress_wkb_line, NULL, NULL);
|
|
116
|
+
sqlite3_create_function(db, "digit_suffix", 1, SQLITE_ANY,
|
|
117
|
+
NULL, sqlite3_digit_suffix, NULL, NULL);
|
|
118
|
+
sqlite3_create_function(db, "nondigit_prefix", 1, SQLITE_ANY,
|
|
119
|
+
NULL, sqlite3_nondigit_prefix, NULL, NULL);
|
|
120
|
+
return 0;
|
|
121
|
+
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/*
 * Declarations of the helper routines that extension.c wraps as SQL
 * functions.
 * NOTE(review): extension.c also calls digit_suffix(input, output), which
 * has no prototype here -- confirm its signature in util.c and declare it.
 */
#ifndef SQLITE3_GEOCODER
#define SQLITE3_GEOCODER

#include <stdint.h>

/* Writes the Metaphone code of Word into Metaph, emitting at most
 * max_phones code characters, and returns the length of the code. */
int metaphone(const char *Word, char *Metaph, int max_phones);
/* Edit distance between s1 and s2 divided by the longer string's length. */
double levenshtein_distance (const unsigned char *s1, const unsigned char *s2);
/* Defined outside this header's view; presumably the index of the last
 * non-digit character in string -- TODO confirm against util.c. */
signed int rindex_nondigit (const char *string);
/* Defined outside this header's view; extension.c uses the return value as
 * the length of the text written to output. */
signed int nondigit_prefix (const char *input, char *output);
/* extension.c expects the return value to equal ceil((len-9)/8.0)*4. */
uint32_t compress_wkb_line (void *dest, const void *src, uint32_t len);
/* extension.c expects the return value to equal len*2+9. */
uint32_t uncompress_wkb_line (void *dest, const void *src, uint32_t len);

#endif
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# include <string.h>
|
|
2
|
+
# define STRLEN_MAX 256
# define min(x, y) ((x) < (y) ? (x) : (y))
# define max(x, y) ((x) > (y) ? (x) : (y))
/* Clearing bit 5 folds ASCII lower case onto upper case. */
# define NO_CASE (~(unsigned char)32)
# define eql(x, y) (((x) & NO_CASE) == ((y) & NO_CASE))

/*
 * Normalized, case-insensitive edit distance between s1 and s2.
 *
 * Counts insertions, deletions, substitutions and transpositions of two
 * adjacent characters, then divides by the length of the longer string, so
 * the result lies in [0, 1].  Only the first STRLEN_MAX-1 characters of each
 * string are considered.  Two empty strings compare as identical (0.0);
 * previously that case divided zero by zero and returned NaN.
 *
 * Only three rows of the distance matrix are ever needed (current, previous
 * and the one before, for transpositions).  They live on the stack, so the
 * function keeps no shared state between calls.
 */
double levenshtein_distance (const unsigned char *s1, const unsigned char *s2) {
    int rows[3][STRLEN_MAX];
    int *prev2 = rows[0], *prev = rows[1], *cur = rows[2], *swap;
    const size_t n1 = strlen((const char *) s1),
                 n2 = strlen((const char *) s2);
    const size_t len1 = min(n1, STRLEN_MAX-1),
                 len2 = min(n2, STRLEN_MAX-1);
    size_t i, j;
    int cost;

    if (len1 == 0 && len2 == 0)
        return 0.0;

    /* Row 0: turning the empty prefix of s1 into s2[0..j) takes j insertions. */
    for (j = 0; j <= len2; ++j) prev[j] = (int) j;

    for (i = 1; i <= len1; ++i) {
        cur[0] = (int) i;
        for (j = 1; j <= len2; ++j) {
            cost = (eql(s1[i-1], s2[j-1]) ? 0 : 1);
            cur[j] = min(min(
                prev[j  ] + 1,      /* deletion */
                cur [j-1] + 1),     /* insertion */
                prev[j-1] + cost);  /* substitution */
            if (i > 1 && j > 1 && eql(s1[i-1], s2[j-2]) && eql(s1[i-2], s2[j-1])) {
                cur[j] = min( cur[j],
                              prev2[j-2] + cost );   /* transposition */
            }
        }
        /* Rotate the rows; the oldest one is recycled as the next current row. */
        swap = prev2; prev2 = prev; prev = cur; cur = swap;
    }
    /* After the rotation, prev holds the row for the whole of s1. */
    return (prev[len2] / (double) max(len1, len2));
}
|
|
32
|
+
|
|
33
|
+
#ifdef TEST
#include <stdio.h>

/*
 * Command-line driver, built only with -DTEST: prints the normalized
 * distance between the two arguments as a percentage.
 */
int main (int argc, char **argv) {
    if (argc < 3) {
        fprintf(stderr, "usage: %s STRING1 STRING2\n",
                argc > 0 ? argv[0] : "levenshtein");
        return -1;
    }
    printf("%.1f%%\n",
           levenshtein_distance((const unsigned char *) argv[1],
                                (const unsigned char *) argv[2]) * 100);
    return 0;
}

#endif
|
|
@@ -0,0 +1,278 @@
|
|
|
1
|
+
/* +++Customized by SDE for sqlite3 use 09-Mar-2009 */
|
|
2
|
+
/* +++File obtained from http://www.shedai.net/c/new/METAPHON.C */
|
|
3
|
+
/* +++Date previously modified: 05-Jul-1997 */
|
|
4
|
+
|
|
5
|
+
/*
|
|
6
|
+
** METAPHON.C - Phonetic string matching
|
|
7
|
+
**
|
|
8
|
+
** The Metaphone algorithm was developed by Lawrence Phillips. Like the
|
|
9
|
+
** Soundex algorithm, it compares words that sound alike but are spelled
|
|
10
|
+
** differently. Metaphone was designed to overcome difficulties encountered
|
|
11
|
+
** with Soundex.
|
|
12
|
+
**
|
|
13
|
+
** This implementation was written by Gary A. Parker and originally published
|
|
14
|
+
** in the June/July, 1991 (vol. 5 nr. 4) issue of C Gazette. As published,
|
|
15
|
+
** this code was explicitly placed in the public domain by the author.
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
#include <ctype.h>
|
|
19
|
+
#include <string.h> /* strlen() */
|
|
20
|
+
#include <stdio.h>
|
|
21
|
+
#define malloc(x) sqlite3_malloc((x))
|
|
22
|
+
#define free(x) sqlite3_free((x))
|
|
23
|
+
|
|
24
|
+
/*
|
|
25
|
+
** Character coding array
|
|
26
|
+
*/
|
|
27
|
+
|
|
28
|
+
static const char vsvfn[26] = {
      1,16,4,16,9,2,4,16,9,2,0,2,2,2,1,4,0,2,4,4,1,0,0,0,8,0};
/*    A  B C  D E F G  H I J K L M N O P Q R S T U V W X Y Z      */

/*
** Macros to access the character coding array.
**
** The working buffer holds digits and '\0' padding as well as letters, so
** the lookup is bounds-checked: anything outside 'A'..'Z' belongs to no
** class.  (Indexing the table directly with such a character reads outside
** the array.)
*/

#define vsvfn_bits(x) (((x) >= 'A' && (x) <= 'Z') ? vsvfn[(x) - 'A'] : 0)

#define vowel(x)  (vsvfn_bits(x) & 1)  /* AEIOU    */
#define same(x)   (vsvfn_bits(x) & 2)  /* FJLMNR   */
#define varson(x) (vsvfn_bits(x) & 4)  /* CGPST    */
#define frontv(x) (vsvfn_bits(x) & 8)  /* EIY      */
#define noghf(x)  (vsvfn_bits(x) & 16) /* BDH      */

/* Character k places before p, or '\0' when that lies before base. */
#define lookback(base, p, k) (((p) - (base) >= (k)) ? *((p) - (k)) : '\0')

/*
** Compute the Metaphone code of Word.
**
** Word        NUL-terminated input; non-alphanumeric characters are ignored.
** Metaph      output buffer; the code written is NUL-terminated.
** max_phones  maximum number of code characters produced by the main loop.
**
** Returns the length of the code written to Metaph.  An input without any
** alphanumeric character yields an empty code and 0; so does a failure to
** allocate the working buffer.
**
** NOTE(review): malloc/free are whatever this file's macros map them to;
** confirm that a prototype for the underlying allocator is in scope.
*/
int metaphone(const char *Word, char *Metaph, int max_phones) {
    char *n, *n_start, *n_end;          /* Pointers to string */
    char *metaph_start = Metaph, *metaph_end;
                                        /* Pointers to metaph */
    int ntrans_len = strlen(Word)+4;
    char *ntrans = (char *)malloc(sizeof(char) * ntrans_len);
                                        /* Word with uppercase letters */
    int KSflag;                         /* State flag for X translation */

    /* SDE -- special case: if the word starts with a number, just
     * copy the leading digits and return. This means we don't
     * metaphone ordinal number suffixes (i.e. "st","nd","rd") */
    int leading_digit = isdigit((unsigned char) *Word);
    /* SDE -- check for a leading semivowel. needed because
     * the copy in ntrans gets destroyed by the metaphone process. */
    char leading_semivowel = '\0';

    if (ntrans == NULL) {
        Metaph[0] = '\0';
        return 0;                       /* No working buffer available */
    }

    /*
    ** Copy word to internal buffer, dropping non-alphanumeric characters
    ** and converting to upper case.  The <ctype.h> arguments are cast to
    ** unsigned char because a negative char value is not a valid argument.
    */
    for (n = ntrans + 1, n_end = ntrans + ntrans_len - 2;
         *Word && n < n_end; ++Word)
    {
        /* SDE -- see previous comment */
        if (leading_digit && isalpha((unsigned char) *Word))
            break;
        /* SDE -- copy numbers as well, for geocoding street names */
        if (isalnum((unsigned char) *Word))
            *n++ = toupper((unsigned char) *Word);
    }

    if (n == ntrans + 1) {
        free(ntrans);
        Metaph[0]='\0';
        return 0;                       /* Return if zero characters */
    }
    else n_end = n;                     /* Set end of string pointer */

    /*
    ** Pad with '\0's, front and rear
    */

    *n++ = '\0';
    *n   = '\0';
    n    = ntrans;
    *n++ = '\0';

    /* SDE: check for leading semivowel here */
    if (ntrans[1] == 'W' || ntrans[1] == 'Y')
        leading_semivowel = ntrans[1];

    /*
    ** Check for PN, KN, GN, AE, WR, WH, and X at start
    */

    switch (*n)
    {
    case 'P':
    case 'K':
    case 'G':
        if ('N' == *(n + 1))
            *n++ = '\0';
        break;

    case 'A':
        if ('E' == *(n + 1))
            *n++ = '\0';
        break;

    case 'W':
        if ('R' == *(n + 1))
            *n++ = '\0';
        else if ('H' == *(n + 1))
        {
            *(n + 1) = *n;
            *n++ = '\0';
        }
        break;

    case 'X':
        *n = 'S';
        break;
    }

    /*
    ** Now loop through the string, stopping at the end of the string
    ** or when the computed Metaphone code is max_phones characters long.
    */

    KSflag = 0;                         /* State flag for KS translation */
    for (metaph_end = Metaph + max_phones, n_start = n;
         n <= n_end && Metaph < metaph_end; ++n)
    {
        if (KSflag)
        {
            /* NOTE(review): after an interior X only 'K' has been emitted;
             * this copies the character that follows the X verbatim.
             * Kept as is -- confirm that this is the intended "KS". */
            KSflag = 0;
            *Metaph++ = *n;
        }
        else
        {
            /* SDE -- special case: copy numbers verbatim */
            if (isdigit((unsigned char) *n)) {
                *Metaph++ = *n;
                continue;
            }

            /* Drop duplicates except for CC */
            if (*(n - 1) == *n && *n != 'C')
                continue;

            /* Check for F J L M N R or first letter vowel */

            if (same(*n) || (n == n_start && vowel(*n)))
                *Metaph++ = *n;
            else switch (*n)
            {
            case 'B':
                if (n < n_end || *(n - 1) != 'M')
                    *Metaph++ = *n;
                break;

            case 'C':
                if (*(n - 1) != 'S' || !frontv(*(n + 1)))
                {
                    if ('I' == *(n + 1) && 'A' == *(n + 2))
                        *Metaph++ = 'X';
                    else if (frontv(*(n + 1)))
                        *Metaph++ = 'S';
                    else if ('H' == *(n + 1))
                        *Metaph++ = ((n == n_start &&
                                      !vowel(*(n + 2))) ||
                                     'S' == *(n - 1)) ? 'K' : 'X';
                    else *Metaph++ = 'K';
                }
                break;

            case 'D':
                *Metaph++ = ('G' == *(n + 1) && frontv(*(n + 2))) ?
                            'J' : 'T';
                break;

            case 'G':
                if ((*(n + 1) != 'H' || vowel(*(n + 2))) &&
                    (*(n + 1) != 'N' || ((n + 1) < n_end &&
                     (*(n + 2) != 'E' || *(n + 3) != 'D'))) &&
                    (*(n - 1) != 'D' || !frontv(*(n + 1))))
                {
                    *Metaph++ = (frontv(*(n + 1)) &&
                                 *(n + 2) != 'G') ? 'J' : 'K';
                }
                /* Only one '\0' precedes the word, so looking three or
                 * four characters back near its start must be bounded. */
                else if ('H' == *(n + 1) &&
                         !noghf(lookback(ntrans, n, 3)) &&
                         lookback(ntrans, n, 4) != 'H')
                {
                    *Metaph++ = 'F';
                }
                break;

            case 'H':
                if (!varson(*(n - 1)) && (!vowel(*(n - 1)) ||
                                          vowel(*(n + 1))))
                {
                    *Metaph++ = 'H';
                }
                break;

            case 'K':
                if (*(n - 1) != 'C')
                    *Metaph++ = 'K';
                break;

            case 'P':
                *Metaph++ = ('H' == *(n + 1)) ? 'F' : 'P';
                break;

            case 'Q':
                *Metaph++ = 'K';
                break;

            case 'S':
                *Metaph++ = ('H' == *(n + 1) || ('I' == *(n + 1) &&
                             ('O' == *(n + 2) || 'A' == *(n + 2)))) ?
                            'X' : 'S';
                break;

            case 'T':
                if ('I' == *(n + 1) && ('O' == *(n + 2) ||
                                        'A' == *(n + 2)))
                {
                    *Metaph++ = 'X';
                }
                else if ('H' == *(n + 1))
                    /* SDE: was:
                    *Metaph++ = 'O';
                    but that's WRONG. */
                    *Metaph++ = '0';
                else if (*(n + 1) != 'C' || *(n + 2) != 'H')
                    *Metaph++ = 'T';
                break;

            case 'V':
                *Metaph++ = 'F';
                break;

            case 'W':
            case 'Y':
                if (vowel(*(n + 1)))
                    *Metaph++ = *n;
                break;

            case 'X':
                if (n == n_start)
                    *Metaph++ = 'S';
                else
                {
                    *Metaph++ = 'K';
                    KSflag = 1;
                }
                break;

            case 'Z':
                *Metaph++ = 'S';
                break;
            }
        }
    }

    /* SDE: special case: if word consists solely of W or Y, use that. */
    if (Metaph == metaph_start && leading_semivowel)
        *Metaph++ = leading_semivowel;

    *Metaph = '\0';
    free(ntrans);
    return strlen(metaph_start);
}
|
|
278
|
+
|