rmmseg-cpp-new 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +23 -0
- data/README.md +3 -0
- data/bin/rmmseg +63 -0
- data/data/chars.dic +12638 -0
- data/data/words.dic +120308 -0
- data/ext/rmmseg/algor.cpp +253 -0
- data/ext/rmmseg/algor.h +79 -0
- data/ext/rmmseg/chunk.h +59 -0
- data/ext/rmmseg/dict.cpp +230 -0
- data/ext/rmmseg/dict.h +34 -0
- data/ext/rmmseg/extconf.rb +13 -0
- data/ext/rmmseg/memory.cpp +9 -0
- data/ext/rmmseg/memory.h +43 -0
- data/ext/rmmseg/rmmseg.cpp +263 -0
- data/ext/rmmseg/rules.h +86 -0
- data/ext/rmmseg/token.h +19 -0
- data/ext/rmmseg/word.h +44 -0
- data/lib/rmmseg-cpp-new.rb +2 -0
- data/lib/rmmseg/dictionary.rb +59 -0
- data/lib/rmmseg/ferret.rb +64 -0
- metadata +68 -0
data/ext/rmmseg/algor.cpp
ADDED
@@ -0,0 +1,253 @@
```cpp
#include <cctype>
#include <cassert>

#include "rules.h"
#include "algor.h"

using namespace std;

namespace rmmseg
{
    Token Algorithm::next_token()
    {
        do
        {
            if (m_pos >= m_text_length)
                return Token(NULL, 0);

            Token tk(NULL, 0);
            int len = next_char();
            if (len == 1)
                tk = get_basic_latin_word();
            else
                tk = get_cjk_word(len);
            if (tk.length > 0)
                return tk;
        }
        while (true);
    }

    Token Algorithm::get_basic_latin_word()
    {
        int len = 1;
        int start, end;

        // Skip pre-word whitespaces and punctuations
        while (m_pos < m_text_length)
        {
            if (len > 1)
                break;
            if (isalnum(m_text[m_pos]))
                break;
            m_pos++;
            len = next_char();
        }

        start = m_pos;
        while (m_pos < m_text_length)
        {
            if (len > 1)
                break;
            if (!isalnum(m_text[m_pos]))
                break;
            m_pos++;
            len = next_char();
        }
        end = m_pos;

        // Skip post-word whitespaces and punctuations
        while (m_pos < m_text_length)
        {
            if (len > 1)
                break;
            if (isalnum(m_text[m_pos]))
                break;
            m_pos++;
            len = next_char();
        }

        return Token(m_text+start, end-start);
    }

    Token Algorithm::get_cjk_word(int len)
    {
        vector<Chunk> chunks = create_chunks();

        if (chunks.size() > 1)
            mm_filter(chunks);
        if (chunks.size() > 1)
            lawl_filter(chunks);
        if (chunks.size() > 1)
            svwl_filter(chunks);
        if (chunks.size() > 1)
            lsdmfocw_filter(chunks);

        if (chunks.size() < 1)
            return Token(NULL, 0);

        Token token(m_text+m_pos, chunks[0].words[0]->nbytes);
        m_pos += chunks[0].words[0]->nbytes;
        return token;
    }

    vector<Chunk> Algorithm::create_chunks()
    {
        vector<Chunk> chunks;
        Chunk chunk;
        Word *w1, *w2, *w3;

        int orig_pos = m_pos;
        typedef vector<Word *> vec_t;
        typedef vec_t::iterator it_t;

        vec_t words1 = find_match_words();
        for (it_t i1 = words1.begin();
             i1 != words1.end();
             ++i1)
        {
            w1 = *i1;
            chunk.words[0] = w1;
            m_pos += w1->nbytes;
            if (m_pos < m_text_length)
            {
                vec_t words2 = find_match_words();
                for (it_t i2 = words2.begin();
                     i2 != words2.end();
                     ++i2)
                {
                    w2 = *i2;
                    chunk.words[1] = w2;
                    m_pos += w2->nbytes;
                    if (m_pos < m_text_length)
                    {
                        vec_t words3 = find_match_words();
                        for (it_t i3 = words3.begin();
                             i3 != words3.end();
                             ++i3)
                        {
                            w3 = *i3;
                            if (w3->length == -1) // tmp word
                            {
                                chunk.n = 2;
                            }
                            else
                            {
                                chunk.n = 3;
                                chunk.words[2] = w3;
                            }
                            chunks.push_back(chunk);
                        }
                    }
                    else if (m_pos == m_text_length)
                    {
                        chunk.n = 2;
                        chunks.push_back(chunk);
                    }
                    m_pos -= w2->nbytes;
                }
            }
            else if (m_pos == m_text_length)
            {
                chunk.n = 1;
                chunks.push_back(chunk);
            }
            m_pos -= w1->nbytes;
        }

        m_pos = orig_pos;
        return chunks;
    }

    int Algorithm::next_char()
    {
        // ONLY for UTF-8
        unsigned char ch = m_text[m_pos];

        if ( ch>>7 == 0 )
            return 1;
        if ( ch>>5 == 0x6 )
            return 2;
        else if (ch>>4 == 0xe)
            return 3;
        else if (ch>>3 == 0x1e )
            return 4;
        else
            return -1;
        // if (ch >= 0xC0 && ch <= 0xDF)
        // {
        //     return 1;
        // }
        // if (ch >= 0x60 && ch <= 0xEF)
        // {
        //     return min(2, m_text_length-m_pos);
        // }
        // if (ch >= 0xE0 && ch <= 0xEF)
        // {
        //     return min(3, m_text_length-m_pos);
        // }
        // return 1;
    }

    // int Algorithm::next_char(const char * p)
    // {
    //     uint8 one = (unsigned char)(*p);

    //     if ( one>>7 == 0 )
    //         return 1;
    //     if ( one>>5 == 0x6 )
    //         return 2;
    //     else if (one >> 4 == 0xe)
    //         return 3;
    //     else if (one >> 3 == 0x1e )
    //         return 4;
    //     else
    //         return -1;
    // }

    vector<Word *> Algorithm::find_match_words()
    {
        for (int i = 0; i < match_cache_size; ++i)
            if (m_match_cache[i].first == m_pos)
                return m_match_cache[i].second;

        vector<Word *> words;
        Word *word;
        int orig_pos = m_pos;
        int n = 0, len;

        while (m_pos < m_text_length)
        {
            if (n >= max_word_length())
                break;
            len = next_char();
            if (len <= 1)
                break;

            m_pos += len;
            n++;

            word = dict::get(m_text+orig_pos, m_pos-orig_pos);
            if (word)
                words.push_back(word);
        }

        m_pos = orig_pos;

        if (words.empty())
        {
            word = get_tmp_word();
            word->nbytes = next_char();
            word->length = -1;
            strncpy(word->text, m_text+m_pos, word->nbytes);
            word->text[word->nbytes] = '\0';
            words.push_back(word);
        }

        m_match_cache[m_match_cache_i] = make_pair(m_pos, words);
        m_match_cache_i++;
        if (m_match_cache_i >= match_cache_size)
            m_match_cache_i = 0;

        return words;
    }
}
```
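A note on usage: `next_token()` is the only entry point a caller needs. It loops until `get_basic_latin_word()` or `get_cjk_word()` yields a non-empty token, and signals end of input with `Token(NULL, 0)`. A minimal driver might look like the sketch below; it assumes `Token` exposes a `text` pointer alongside the `length` member that algor.cpp already reads, and that the bundled dictionaries were loaded beforehand (the file paths are placeholders):

```cpp
#include <cstdio>
#include <string>

#include "algor.h"
#include "dict.h"

int main()
{
    // Placeholder paths; the gem ships these dictionaries under data/data/.
    rmmseg::dict::load_chars("chars.dic");
    rmmseg::dict::load_words("words.dic");

    std::string text = "研究生命起源, plus some ASCII";
    rmmseg::Algorithm algor(text.c_str(), (int)text.length());

    // next_token() returns Token(NULL, 0) once the buffer is exhausted.
    for (rmmseg::Token tk = algor.next_token();
         tk.length > 0;
         tk = algor.next_token())
    {
        printf("<%.*s> ", (int)tk.length, tk.text);
    }
    printf("\n");
    return 0;
}
```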
data/ext/rmmseg/algor.h
ADDED
@@ -0,0 +1,79 @@
```cpp
#ifndef _ALGORITHM_H_
#define _ALGORITHM_H_

#include <vector>

#include "chunk.h"
#include "token.h"
#include "dict.h"

/**
 * The Algorithm of MMSeg use four rules:
 *  - Maximum matching rule
 *  - Largest average word length rule
 *  - Smallest variance of word length rule
 *  - Largest sum of degree of morphemic freedom of one-character
 *    words rule
 */

namespace rmmseg
{
    class Algorithm
    {
    public:
        Algorithm(const char *text, int length)
            :m_text(text), m_pos(0),
             m_text_length(length),
             m_tmp_words_i(0),
             m_match_cache_i(0)
        {
            for (int i = 0; i < match_cache_size; ++i)
                m_match_cache[i].first = -1;
        }

        Token next_token();

        const char *get_text() const
        {
            return m_text;
        }

    private:
        Token get_basic_latin_word();
        Token get_cjk_word(int);

        std::vector<Chunk> create_chunks();
        int next_word();
        int next_char();
        std::vector<Word *> find_match_words();
        int max_word_length() { return 20; }

        const char *m_text;
        int m_pos;
        int m_text_length;

        /* tmp words are only for 1-char words which
         * are not exist in the dictionary. It's length
         * value will be set to -1 to indicate it is
         * a tmp word. */
        Word *get_tmp_word()
        {
            if (m_tmp_words_i >= max_tmp_words)
                m_tmp_words_i = 0; // round wrap
            return &m_tmp_words[m_tmp_words_i++];
        }

        /* related to max_word_length and match_cache_size */
        static const int max_tmp_words = 512;
        Word m_tmp_words[max_tmp_words];
        int m_tmp_words_i;

        /* match word caches */
        static const int match_cache_size = 3;
        typedef std::pair<int, std::vector<Word *> > match_cache_t;
        match_cache_t m_match_cache[match_cache_size];
        int m_match_cache_i;
    };
}

#endif /* _ALGORITHM_H_ */
```
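The four rules listed in the header comment correspond to the `mm_filter`, `lawl_filter`, `svwl_filter` and `lsdmfocw_filter` calls made from algor.cpp; their definitions live in rules.h, which is part of this release but not reproduced in this excerpt. As a rough illustration of the pattern only (this is not the gem's actual rules.h), a maximum-matching style filter keeps the chunks whose `total_length()` ties the maximum and drops the rest:

```cpp
#include <vector>

#include "chunk.h"

// Illustration only -- not the gem's rules.h. Keep the chunks whose
// total_length() equals the maximum and discard everything else; the
// real filters repeat this pattern with the other Chunk metrics.
inline void max_length_filter(std::vector<rmmseg::Chunk> &chunks)
{
    int best = 0;
    for (size_t i = 0; i < chunks.size(); ++i)
        if (chunks[i].total_length() > best)
            best = chunks[i].total_length();

    std::vector<rmmseg::Chunk> kept;
    for (size_t i = 0; i < chunks.size(); ++i)
        if (chunks[i].total_length() == best)
            kept.push_back(chunks[i]);
    chunks.swap(kept);
}
```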
data/ext/rmmseg/chunk.h
ADDED
@@ -0,0 +1,59 @@
```cpp
#ifndef _CHUNK_H_
#define _CHUNK_H_

#include <cmath>

#include "word.h"

namespace rmmseg
{
    /**
     * A chunk stores 3 (or less) successive words.
     */
    struct Chunk
    {
        int total_length() const
        {
            int len = 0;
            for (int i = 0; i < n; ++i)
                len += std::abs(words[i]->length);
                //if (words[i]->length == -1) /* tmp word */
                //    len += 1;
                //else
                //    len += words[i]->length;
            return len;
        }
        double average_length() const
        {
            return ((double)total_length())/n;
        }
        double variance() const
        {
            double avg = average_length();
            double sqr_sum = 0;
            double tmp;
            for (int i = 0; i < n; ++i)
            {
                tmp = std::abs(words[i]->length);
                //if (tmp == -1)
                //    tmp = 1;
                tmp = tmp-avg;
                sqr_sum += tmp*tmp;
            }
            return std::sqrt(sqr_sum);
        }
        int degree_of_morphemic_freedom() const
        {
            int sum = 0;
            for (int i = 0; i < n; ++i)
                sum += words[i]->freq;
            return sum;
        }


        int n;
        Word *words[3];
    };
}

#endif /* _CHUNK_H_ */
```
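Note that `variance()` is not a textbook variance: it returns the square root of the sum of squared deviations from the average length, with no division by `n`. A small worked example follows; the values are illustrative only, and `make_word(text, length, freq)` plus its copying of the text are inferred from how dict.cpp calls it, not shown in this excerpt:

```cpp
#include <cstdio>

#include "chunk.h"
#include "word.h"

// Hypothetical words; the strings and numbers below are placeholders,
// not real dictionary entries.
int main()
{
    rmmseg::Chunk c;
    c.n = 3;
    c.words[0] = rmmseg::make_word("a", 1, 10);
    c.words[1] = rmmseg::make_word("bc", 2, 0);
    c.words[2] = rmmseg::make_word("d", 1, 3);

    printf("total   = %d\n", c.total_length());   // 1 + 2 + 1 = 4
    printf("average = %f\n", c.average_length()); // 4/3 ~= 1.33
    printf("var     = %f\n", c.variance());       // sqrt(1/9 + 4/9 + 1/9) ~= 0.82
    printf("dmf     = %d\n", c.degree_of_morphemic_freedom()); // 10 + 0 + 3 = 13
    return 0;
}
```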
data/ext/rmmseg/dict.cpp
ADDED
@@ -0,0 +1,230 @@
```cpp
#include <cstdio>

#include "dict.h"

using namespace std;

namespace rmmseg
{
    struct Entry
    {
        Word *word;
        Entry *next;
    };

    const size_t init_size = 262147;
    const size_t max_density = 5;
    /*
      Table of prime numbers 2^n+a, 2<=n<=30.
    */
    static size_t primes[] = {
        524288 + 21,
        1048576 + 7,
        2097152 + 17,
        4194304 + 15,
        8388608 + 9,
        16777216 + 43,
        33554432 + 35,
        67108864 + 15,
        134217728 + 29,
        268435456 + 3,
        536870912 + 11,
        1073741824 + 85,
    };


    static size_t n_bins = init_size;
    static size_t n_entries = 0;
    static Entry **bins = static_cast<Entry **>(std::calloc(init_size,
                                                            sizeof(Entry *)));

    static size_t new_size()
    {
        for (size_t i = 0;
             i < sizeof(primes)/sizeof(primes[0]);
             ++i)
        {
            if (primes[i] > n_bins)
            {
                return primes[i];
            }
        }
        // TODO: raise exception here
        return n_bins;
    }

    static unsigned int hash(const char *str, int len)
    {
        unsigned int key = 0;
        while (len--)
        {
            key += *str++;
            key += (key << 10);
            key ^= (key >> 6);
        }
        key += (key << 3);
        key ^= (key >> 11);
        key += (key << 15);
        return key;
    }

    static void rehash()
    {
        int new_n_bins = new_size();
        Entry **new_bins = static_cast<Entry **>(calloc(new_n_bins,
                                                        sizeof(Entry *)));
        Entry *entry, *next;
        unsigned int hash_val;

        for (size_t i = 0; i < n_bins; ++i)
        {
            entry = bins[i];
            while (entry)
            {
                next = entry->next;
                hash_val = hash(entry->word->text,
                                entry->word->nbytes) % new_n_bins;
                entry->next = new_bins[hash_val];
                new_bins[hash_val] = entry;
                entry = next;
            }
        }
        free(bins);
        n_bins = new_n_bins;
        bins = new_bins;
    }

    namespace dict
    {

        /**
         * str: the base of the string
         * len: length of the string (in bytes)
         *
         * str may be a substring of a big chunk of text thus not nul-terminated,
         * so len is necessary here.
         */
        Word *get(const char *str, int len)
        {
            unsigned int h = hash(str, len) % n_bins;
            Entry *entry = bins[h];
            if (!entry)
                return NULL;
            do
            {
                if (len == entry->word->nbytes &&
                    strncmp(str, entry->word->text, len) == 0)
                    return entry->word;
                entry = entry->next;
            }
            while (entry);

            return NULL;
        }

        void add(Word *word)
        {
            unsigned int hash_val = hash(word->text, word->nbytes);
            unsigned int h = hash_val % n_bins;
            Entry *entry = bins[h];
            if (!entry)
            {
                if (n_entries/n_bins > max_density)
                {
                    rehash();
                    h = hash_val % n_bins;
                }

                entry = static_cast<Entry *>(pool_alloc(sizeof(Entry)));
                entry->word = word;
                entry->next = NULL;
                bins[h] = entry;
                n_entries++;
                return;
            }

            bool done = false;
            do
            {
                if (word->nbytes == entry->word->nbytes &&
                    strncmp(word->text, entry->word->text, word->nbytes) == 0)
                {
                    /* Overwriting. WARNING: the original Word object is
                     * permanently lost. This IS a memory leak, because
                     * the memory is allocated by pool_alloc. Instead of
                     * fixing this, tuning the dictionary file is a better
                     * idea
                     */
                    entry->word = word;
                    done = true;
                    break;
                }
                entry = entry->next;
            }
            while (entry);

            if (!done)
            {
                entry = static_cast<Entry *>(pool_alloc(sizeof(Entry)));
                entry->word = word;
                entry->next = bins[h];
                bins[h] = entry;
                n_entries++;
            }
        }

        bool load_chars(const char *filename)
        {
            FILE *fp = fopen(filename, "r");
            if (!fp)
            {
                return false;
            }

            const size_t buf_len = 24;
            char buf[buf_len];
            char *ptr;

            while(fgets(buf, buf_len, fp))
            {
                // NOTE: there SHOULD be a newline at the end of the file
                buf[strlen(buf)-1] = '\0'; // truncate the newline
                ptr = strchr(buf, ' ');
                if (!ptr)
                    continue; // illegal input
                *ptr = '\0';
                add(make_word(ptr+1, 1, atoi(buf)));
            }

            fclose(fp);
            return true;
        }

        bool load_words(const char *filename)
        {
            FILE *fp = fopen(filename, "r");
            if (!fp)
            {
                return false;
            }

            const int buf_len = 48;
            char buf[buf_len];
            char *ptr;

            while(fgets(buf, buf_len, fp))
            {
                // NOTE: there SHOULD be a newline at the end of the file
                buf[strlen(buf)-1] = '\0'; // truncate the newline
                ptr = strchr(buf, ' ');
                if (!ptr)
                    continue; // illegal input
                *ptr = '\0';
                add(make_word(ptr+1, atoi(buf), 0));
            }

            fclose(fp);
            return true;
        }
    }
}
```
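The dictionary is a chained hash table keyed by the raw word bytes (the `hash` routine is the Jenkins one-at-a-time hash), rehashed to the next prime in `primes[]` once the load factor exceeds `max_density`. The two loaders also document the on-disk format: each chars.dic line is `<frequency> <character>` and each words.dic line is `<length> <word>`, separated by a single space and ending in a newline. A small lookup sketch, with placeholder file paths (the gem ships the dictionaries under data/data/):

```cpp
#include <cstdio>

#include "dict.h"

int main()
{
    // Placeholder paths for the bundled dictionaries.
    if (!rmmseg::dict::load_chars("chars.dic") ||
        !rmmseg::dict::load_words("words.dic"))
    {
        fprintf(stderr, "failed to load dictionaries\n");
        return 1;
    }

    // get() takes an explicit byte count because the key may be an
    // unterminated slice of a larger buffer. "中" is 3 bytes in UTF-8.
    const char *key = "\xE4\xB8\xAD";
    rmmseg::Word *w = rmmseg::dict::get(key, 3);
    if (w)
        printf("found: length=%d freq=%d\n", (int)w->length, (int)w->freq);
    else
        printf("not in the dictionary\n");
    return 0;
}
```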