svmlightcli 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +13 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +23 -0
- data/LICENSE.txt +85 -0
- data/README.md +49 -0
- data/Rakefile +10 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/bin/svm_classify +23 -0
- data/bin/svm_learn +23 -0
- data/ext/svmlightcli/Makefile +95 -0
- data/ext/svmlightcli/extconf.rb +33 -0
- data/ext/svmlightcli/kernel.h +40 -0
- data/ext/svmlightcli/svm_classify.c +197 -0
- data/ext/svmlightcli/svm_common.c +985 -0
- data/ext/svmlightcli/svm_common.h +301 -0
- data/ext/svmlightcli/svm_hideo.c +1062 -0
- data/ext/svmlightcli/svm_learn.c +4147 -0
- data/ext/svmlightcli/svm_learn.h +169 -0
- data/ext/svmlightcli/svm_learn_main.c +397 -0
- data/lib/svmlightcli.rb +6 -0
- data/lib/svmlightcli/version.rb +3 -0
- data/svmlightcli.gemspec +28 -0
- metadata +113 -0
@@ -0,0 +1,197 @@
|
|
1
|
+
/***********************************************************************/
|
2
|
+
/* */
|
3
|
+
/* svm_classify.c */
|
4
|
+
/* */
|
5
|
+
/* Classification module of Support Vector Machine. */
|
6
|
+
/* */
|
7
|
+
/* Author: Thorsten Joachims */
|
8
|
+
/* Date: 02.07.02 */
|
9
|
+
/* */
|
10
|
+
/* Copyright (c) 2002 Thorsten Joachims - All rights reserved */
|
11
|
+
/* */
|
12
|
+
/* This software is available for non-commercial use only. It must */
|
13
|
+
/* not be modified and distributed without prior permission of the */
|
14
|
+
/* author. The author is not responsible for implications from the */
|
15
|
+
/* use of this software. */
|
16
|
+
/* */
|
17
|
+
/************************************************************************/
|
18
|
+
|
19
|
+
# include "svm_common.h"
|
20
|
+
|
21
|
+
char docfile[200];
|
22
|
+
char modelfile[200];
|
23
|
+
char predictionsfile[200];
|
24
|
+
|
25
|
+
void read_input_parameters(int, char **, char *, char *, char *, long *,
|
26
|
+
long *);
|
27
|
+
void print_help(void);
|
28
|
+
|
29
|
+
|
30
|
+
int main (int argc, char* argv[])
|
31
|
+
{
|
32
|
+
DOC *doc; /* test example */
|
33
|
+
WORD *words;
|
34
|
+
long max_docs,max_words_doc,lld;
|
35
|
+
long totdoc=0,queryid,slackid;
|
36
|
+
long correct=0,incorrect=0,no_accuracy=0;
|
37
|
+
long res_a=0,res_b=0,res_c=0,res_d=0,wnum,pred_format;
|
38
|
+
long j;
|
39
|
+
double t1,runtime=0;
|
40
|
+
double dist,doc_label,costfactor;
|
41
|
+
char *line,*comment;
|
42
|
+
FILE *predfl,*docfl;
|
43
|
+
MODEL *model;
|
44
|
+
|
45
|
+
read_input_parameters(argc,argv,docfile,modelfile,predictionsfile,
|
46
|
+
&verbosity,&pred_format);
|
47
|
+
|
48
|
+
nol_ll(docfile,&max_docs,&max_words_doc,&lld); /* scan size of input file */
|
49
|
+
max_words_doc+=2;
|
50
|
+
lld+=2;
|
51
|
+
|
52
|
+
line = (char *)my_malloc(sizeof(char)*lld);
|
53
|
+
words = (WORD *)my_malloc(sizeof(WORD)*(max_words_doc+10));
|
54
|
+
|
55
|
+
model=read_model(modelfile);
|
56
|
+
|
57
|
+
if(model->kernel_parm.kernel_type == 0) { /* linear kernel */
|
58
|
+
/* compute weight vector */
|
59
|
+
add_weight_vector_to_linear_model(model);
|
60
|
+
}
|
61
|
+
|
62
|
+
if(verbosity>=2) {
|
63
|
+
printf("Classifying test examples.."); fflush(stdout);
|
64
|
+
}
|
65
|
+
|
66
|
+
if ((docfl = fopen (docfile, "r")) == NULL)
|
67
|
+
{ perror (docfile); exit (1); }
|
68
|
+
if ((predfl = fopen (predictionsfile, "w")) == NULL)
|
69
|
+
{ perror (predictionsfile); exit (1); }
|
70
|
+
|
71
|
+
while((!feof(docfl)) && fgets(line,(int)lld,docfl)) {
|
72
|
+
if(line[0] == '#') continue; /* line contains comments */
|
73
|
+
parse_document(line,words,&doc_label,&queryid,&slackid,&costfactor,&wnum,
|
74
|
+
max_words_doc,&comment);
|
75
|
+
totdoc++;
|
76
|
+
if(model->kernel_parm.kernel_type == 0) { /* linear kernel */
|
77
|
+
for(j=0;(words[j]).wnum != 0;j++) { /* Check if feature numbers */
|
78
|
+
if((words[j]).wnum>model->totwords) /* are not larger than in */
|
79
|
+
(words[j]).wnum=0; /* model. Remove feature if */
|
80
|
+
} /* necessary. */
|
81
|
+
doc = create_example(-1,0,0,0.0,create_svector(words,comment,1.0));
|
82
|
+
t1=get_runtime();
|
83
|
+
dist=classify_example_linear(model,doc);
|
84
|
+
runtime+=(get_runtime()-t1);
|
85
|
+
free_example(doc,1);
|
86
|
+
}
|
87
|
+
else { /* non-linear kernel */
|
88
|
+
doc = create_example(-1,0,0,0.0,create_svector(words,comment,1.0));
|
89
|
+
t1=get_runtime();
|
90
|
+
dist=classify_example(model,doc);
|
91
|
+
runtime+=(get_runtime()-t1);
|
92
|
+
free_example(doc,1);
|
93
|
+
}
|
94
|
+
if(dist>0) {
|
95
|
+
if(pred_format==0) { /* old weired output format */
|
96
|
+
fprintf(predfl,"%.8g:+1 %.8g:-1\n",dist,-dist);
|
97
|
+
}
|
98
|
+
if(doc_label>0) correct++; else incorrect++;
|
99
|
+
if(doc_label>0) res_a++; else res_b++;
|
100
|
+
}
|
101
|
+
else {
|
102
|
+
if(pred_format==0) { /* old weired output format */
|
103
|
+
fprintf(predfl,"%.8g:-1 %.8g:+1\n",-dist,dist);
|
104
|
+
}
|
105
|
+
if(doc_label<0) correct++; else incorrect++;
|
106
|
+
if(doc_label>0) res_c++; else res_d++;
|
107
|
+
}
|
108
|
+
if(pred_format==1) { /* output the value of decision function */
|
109
|
+
fprintf(predfl,"%.8g\n",dist);
|
110
|
+
}
|
111
|
+
if((int)(0.01+(doc_label*doc_label)) != 1)
|
112
|
+
{ no_accuracy=1; } /* test data is not binary labeled */
|
113
|
+
if(verbosity>=2) {
|
114
|
+
if(totdoc % 100 == 0) {
|
115
|
+
printf("%ld..",totdoc); fflush(stdout);
|
116
|
+
}
|
117
|
+
}
|
118
|
+
}
|
119
|
+
fclose(predfl);
|
120
|
+
fclose(docfl);
|
121
|
+
free(line);
|
122
|
+
free(words);
|
123
|
+
free_model(model,1);
|
124
|
+
|
125
|
+
if(verbosity>=2) {
|
126
|
+
printf("done\n");
|
127
|
+
|
128
|
+
/* Note by Gary Boone Date: 29 April 2000 */
|
129
|
+
/* o Timing is inaccurate. The timer has 0.01 second resolution. */
|
130
|
+
/* Because classification of a single vector takes less than */
|
131
|
+
/* 0.01 secs, the timer was underflowing. */
|
132
|
+
printf("Runtime (without IO) in cpu-seconds: %.2f\n",
|
133
|
+
(float)(runtime/100.0));
|
134
|
+
|
135
|
+
}
|
136
|
+
if((!no_accuracy) && (verbosity>=1)) {
|
137
|
+
printf("Accuracy on test set: %.2f%% (%ld correct, %ld incorrect, %ld total)\n",(float)(correct)*100.0/totdoc,correct,incorrect,totdoc);
|
138
|
+
printf("Precision/recall on test set: %.2f%%/%.2f%%\n",(float)(res_a)*100.0/(res_a+res_b),(float)(res_a)*100.0/(res_a+res_c));
|
139
|
+
}
|
140
|
+
|
141
|
+
return(0);
|
142
|
+
}
|
143
|
+
|
144
|
+
static void copy_filename_or_die(char *dest, const char *src)
/* Copies the file name src into dest, one of the 200-byte file name
   buffers declared at the top of this file. Terminates the program
   with an error message if src (including its terminator) does not
   fit, instead of writing past the end of the buffer. */
{
  enum { FILENAME_CAPACITY = 200 };
  size_t len=strlen(src);

  if(len >= FILENAME_CAPACITY) {
    printf("\nFile name too long (at most %d characters): %s\n\n",
           FILENAME_CAPACITY-1,src);
    exit(1);
  }
  memcpy(dest,src,len+1);
}

void read_input_parameters(int argc, char **argv, char *docfile,
                           char *modelfile, char *predictionsfile,
                           long int *verbosity, long int *pred_format)
/* Parses the command line of svm_classify.
   argc, argv       command line as passed to main
   docfile          out: name of the example file (first positional arg)
   modelfile        out: name of the model file (second positional arg)
   predictionsfile  out: name of the output file (optional third
                    positional arg, default "svm_predictions")
   verbosity        out: value of -v (default 2)
   pred_format      out: value of -f (default 1, must be 0 or 1)
   Prints the help text and exits on -h, on an unknown option, on an
   option without its value, on missing positional arguments and on an
   invalid output format. */
{
  long i;

  /* set default */
  copy_filename_or_die(modelfile,"svm_model");
  copy_filename_or_die(predictionsfile,"svm_predictions");
  (*verbosity)=2;
  (*pred_format)=1;

  for(i=1;(i<argc) && ((argv[i])[0] == '-');i++) {
    switch ((argv[i])[1])
      {
      case 'h': print_help(); exit(0);
      case 'v':
      case 'f':
        /* both options need a value; without this check argv[argc]
           (a NULL pointer) would be handed to atol() */
        if((i+1)>=argc) {
          printf("\nNot enough input parameters!\n\n");
          print_help();
          exit(0);
        }
        if((argv[i])[1] == 'v')
          (*verbosity)=atol(argv[i+1]);
        else
          (*pred_format)=atol(argv[i+1]);
        i++;
        break;
      default: printf("\nUnrecognized option %s!\n\n",argv[i]);
               print_help();
               exit(0);
      }
  }
  if((i+1)>=argc) {
    printf("\nNot enough input parameters!\n\n");
    print_help();
    exit(0);
  }
  copy_filename_or_die(docfile,argv[i]);
  copy_filename_or_die(modelfile,argv[i+1]);
  if((i+2)<argc) {
    copy_filename_or_die(predictionsfile,argv[i+2]);
  }
  if(((*pred_format) != 0) && ((*pred_format) != 1)) {
    printf("\nOutput format can only take the values 0 or 1!\n\n");
    print_help();
    exit(0);
  }
}
|
183
|
+
|
184
|
+
void print_help(void)
|
185
|
+
{
|
186
|
+
printf("\nSVM-light %s: Support Vector Machine, classification module %s\n",VERSION,VERSION_DATE);
|
187
|
+
copyright_notice();
|
188
|
+
printf(" usage: svm_classify [options] example_file model_file output_file\n\n");
|
189
|
+
printf("options: -h -> this help\n");
|
190
|
+
printf(" -v [0..3] -> verbosity level (default 2)\n");
|
191
|
+
printf(" -f [0,1] -> 0: old output format of V1.0\n");
|
192
|
+
printf(" -> 1: output the value of decision function (default)\n\n");
|
193
|
+
}
|
194
|
+
|
195
|
+
|
196
|
+
|
197
|
+
|
@@ -0,0 +1,985 @@
|
|
1
|
+
/************************************************************************/
|
2
|
+
/* */
|
3
|
+
/* svm_common.c */
|
4
|
+
/* */
|
5
|
+
/* Definitions and functions used in both svm_learn and svm_classify. */
|
6
|
+
/* */
|
7
|
+
/* Author: Thorsten Joachims */
|
8
|
+
/* Date: 02.07.04 */
|
9
|
+
/* */
|
10
|
+
/* Copyright (c) 2004 Thorsten Joachims - All rights reserved */
|
11
|
+
/* */
|
12
|
+
/* This software is available for non-commercial use only. It must */
|
13
|
+
/* not be modified and distributed without prior permission of the */
|
14
|
+
/* author. The author is not responsible for implications from the */
|
15
|
+
/* use of this software. */
|
16
|
+
/* */
|
17
|
+
/************************************************************************/
|
18
|
+
|
19
|
+
# include "ctype.h"
|
20
|
+
# include "svm_common.h"
|
21
|
+
# include "kernel.h" /* this contains a user supplied kernel */
|
22
|
+
|
23
|
+
long verbosity; /* verbosity level (0-4) */
|
24
|
+
long kernel_cache_statistic;
|
25
|
+
|
26
|
+
double classify_example(MODEL *model, DOC *ex)
/* Classifies one example: returns the kernel expansion over the
   support vectors (slots 1..sv_num-1) minus the threshold b. For a
   linear kernel with a precomputed weight vector the work is
   delegated to classify_example_linear(). */
{
  double score=0;
  long sv;

  if(model->lin_weights && (model->kernel_parm.kernel_type == LINEAR)) {
    return(classify_example_linear(model,ex));
  }

  sv=1;
  while(sv<model->sv_num) {
    score+=kernel(&model->kernel_parm,model->supvec[sv],ex)*model->alpha[sv];
    sv++;
  }
  return(score-model->b);
}
|
41
|
+
|
42
|
+
double classify_example_linear(MODEL *model, DOC *ex)
/* Classifies an example with a linear kernel using the explicit
   weight vector: sums factor * <lin_weights, vector> over the
   example's SVECTOR list and subtracts the threshold b.

   Important: the model must have its linear weight vector computed
   (see add_weight_vector_to_linear_model()), and no feature number of
   the example may be larger than the weight vector. */
{
  SVECTOR *part=ex->fvec;
  double total=0;

  while(part) {
    total+=part->factor*sprod_ns(model->lin_weights,part);
    part=part->next;
  }
  return(total-model->b);
}
|
59
|
+
|
60
|
+
|
61
|
+
double kernel(KERNEL_PARM *kernel_parm, DOC *a, DOC *b)
/* Calculates the kernel function between two examples. An example may
   be a sum of feature vectors, stored as a list of SVECTORs each with
   its own coefficient factor. The result is the factor-weighted sum
   of single_kernel() over all pairs of vectors that carry the same
   kernel_id. */
{
  SVECTOR *va,*vb;
  double total=0;

  va=a->fvec;
  while(va) {
    vb=b->fvec;
    while(vb) {
      if(va->kernel_id == vb->kernel_id) {
        total+=va->factor*vb->factor*single_kernel(kernel_parm,va,vb);
      }
      vb=vb->next;
    }
    va=va->next;
  }
  return(total);
}
|
78
|
+
|
79
|
+
double single_kernel(KERNEL_PARM *kernel_parm, SVECTOR *a, SVECTOR *b)
/* Calculates the kernel function between two vectors, selected by
   kernel_parm->kernel_type, and counts the evaluation in
   kernel_cache_statistic. An unknown kernel type prints an error and
   terminates the program. */
{
  long type;

  kernel_cache_statistic++;
  type=kernel_parm->kernel_type;

  if(type == 0) {          /* linear */
    return(sprod_ss(a,b));
  }
  if(type == 1) {          /* polynomial */
    return(pow(kernel_parm->coef_lin*sprod_ss(a,b)+kernel_parm->coef_const,(double)kernel_parm->poly_degree));
  }
  if(type == 2) {          /* radial basis function */
    return(exp(-kernel_parm->rbf_gamma*(a->twonorm_sq-2*sprod_ss(a,b)+b->twonorm_sq)));
  }
  if(type == 3) {          /* sigmoid neural net */
    return(tanh(kernel_parm->coef_lin*sprod_ss(a,b)+kernel_parm->coef_const));
  }
  if(type == 4) {          /* custom-kernel supplied in file kernel.h */
    return(custom_kernel(kernel_parm,a,b));
  }

  printf("Error: Unknown kernel function\n");
  exit(1);
}
|
97
|
+
|
98
|
+
|
99
|
+
SVECTOR *create_svector(WORD *words,char *userdefined,double factor)
/* Allocates a new SVECTOR holding private copies of its inputs.
   words        feature list terminated by an entry with wnum == 0;
                copied including the terminator
   userdefined  NUL-terminated string attached to the vector; a NULL
                pointer is treated like the empty string, so the
                stored string is always valid
   factor       coefficient of the vector inside an SVECTOR list
   The new vector has kernel_id 0, next == NULL and twonorm_sq set to
   its squared Euclidean norm. Release it with free_svector(). */
{
  SVECTOR *vec;
  size_t nwords,textlen;

  /* number of entries including the terminating wnum == 0 entry */
  nwords=0;
  while(words[nwords].wnum) {
    nwords++;
  }
  nwords++;

  vec = (SVECTOR *)my_malloc(sizeof(SVECTOR));
  vec->words = (WORD *)my_malloc(sizeof(WORD)*nwords);
  memcpy(vec->words,words,sizeof(WORD)*nwords);
  vec->twonorm_sq=sprod_ss(vec,vec);

  if(!userdefined) {
    userdefined="";   /* avoid dereferencing a NULL pointer */
  }
  textlen=strlen(userdefined)+1;
  vec->userdefined = (char *)my_malloc(sizeof(char)*textlen);
  memcpy(vec->userdefined,userdefined,textlen);

  vec->kernel_id=0;
  vec->next=NULL;
  vec->factor=factor;
  return(vec);
}
|
130
|
+
|
131
|
+
SVECTOR *copy_svector(SVECTOR *vec)
/* Returns a deep copy of the SVECTOR list starting at vec (NULL for
   an empty list). Words, userdefined string, factor and kernel_id of
   every element are preserved. kernel_id must be copied explicitly
   because create_svector() always resets it to 0, and kernel() only
   pairs vectors with equal kernel_id. */
{
  SVECTOR *head=NULL;
  SVECTOR **link=&head;   /* where the next copy gets attached */
  SVECTOR *dup;

  for(;vec;vec=vec->next) {
    dup=create_svector(vec->words,vec->userdefined,vec->factor);
    dup->kernel_id=vec->kernel_id;
    (*link)=dup;
    link=&(dup->next);
  }
  return(head);
}
|
140
|
+
|
141
|
+
void free_svector(SVECTOR *vec)
/* Releases every element of the SVECTOR list starting at vec,
   including each element's word array and userdefined string.
   A NULL list is accepted. */
{
  SVECTOR *rest;

  while(vec) {
    rest=vec->next;
    free(vec->words);
    if(vec->userdefined) {
      free(vec->userdefined);
    }
    free(vec);
    vec=rest;
  }
}
|
151
|
+
|
152
|
+
double sprod_ss(SVECTOR *a, SVECTOR *b)
/* Computes the inner product of two sparse vectors by walking both
   word lists in parallel; the comparison logic relies on the feature
   numbers being in ascending order. Only the first SVECTOR of each
   list is used. */
{
  WORD *pa=a->words;
  WORD *pb=b->words;
  double dot=0;

  while(pa->wnum && pb->wnum) {
    if(pa->wnum == pb->wnum) {
      dot+=(pa->weight) * (pb->weight);
      pa++;
      pb++;
    }
    else if(pa->wnum < pb->wnum) {
      pa++;
    }
    else {
      pb++;
    }
  }
  return((double)dot);
}
|
174
|
+
|
175
|
+
SVECTOR* sub_ss(SVECTOR *a, SVECTOR *b)
/* Computes the difference a-b of two sparse vectors and returns it as
   a newly allocated SVECTOR with an empty userdefined string and
   factor 1.0. Features whose difference is exactly 0 are dropped.
   Note: SVECTOR lists are not followed; only the first SVECTOR of
   each argument is used. */
{
  SVECTOR *result;
  WORD *merged,*out;
  WORD *pa,*pb;
  long capacity;

  /* pass 1: size of the union of both feature sets, plus terminator */
  capacity=1;
  pa=a->words;
  pb=b->words;
  while(pa->wnum && pb->wnum) {
    if(pa->wnum == pb->wnum) {
      pa++;
      pb++;
    }
    else if(pa->wnum < pb->wnum) {
      pa++;
    }
    else {
      pb++;
    }
    capacity++;
  }
  for(;pb->wnum;pb++) capacity++;
  for(;pa->wnum;pa++) capacity++;

  /* pass 2: merge the two lists, negating the entries taken from b */
  merged=(WORD *)my_malloc(sizeof(WORD)*capacity);
  out=merged;
  pa=a->words;
  pb=b->words;
  while(pa->wnum && pb->wnum) {
    if(pa->wnum == pb->wnum) {
      (*out)=(*pa);
      out->weight-=pb->weight;
      if(out->weight != 0)      /* keep only non-zero differences */
        out++;
      pa++;
      pb++;
    }
    else if(pa->wnum < pb->wnum) {
      (*out)=(*pa);
      out++;
      pa++;
    }
    else {
      (*out)=(*pb);
      out->weight*=(-1);
      out++;
      pb++;
    }
  }
  for(;pb->wnum;pb++) {
    (*out)=(*pb);
    out->weight*=(-1);
    out++;
  }
  for(;pa->wnum;pa++) {
    (*out)=(*pa);
    out++;
  }
  out->wnum=0;

  result=create_svector(merged,"",1.0);
  free(merged);

  return(result);
}
|
256
|
+
|
257
|
+
SVECTOR* add_ss(SVECTOR *a, SVECTOR *b)
/* Computes the sum a+b of two sparse vectors and returns it as a
   newly allocated SVECTOR with an empty userdefined string and
   factor 1.0. Features whose sum is exactly 0 are dropped.
   Note: SVECTOR lists are not followed; only the first SVECTOR of
   each argument is used. */
{
  SVECTOR *result;
  WORD *merged,*out;
  WORD *pa,*pb;
  long capacity;

  /* pass 1: size of the union of both feature sets, plus terminator */
  capacity=1;
  pa=a->words;
  pb=b->words;
  while(pa->wnum && pb->wnum) {
    if(pa->wnum == pb->wnum) {
      pa++;
      pb++;
    }
    else if(pa->wnum < pb->wnum) {
      pa++;
    }
    else {
      pb++;
    }
    capacity++;
  }
  for(;pb->wnum;pb++) capacity++;
  for(;pa->wnum;pa++) capacity++;

  /* pass 2: merge the two lists, adding weights of shared features */
  merged=(WORD *)my_malloc(sizeof(WORD)*capacity);
  out=merged;
  pa=a->words;
  pb=b->words;
  while(pa->wnum && pb->wnum) {
    if(pa->wnum == pb->wnum) {
      (*out)=(*pa);
      out->weight+=pb->weight;
      if(out->weight != 0)      /* keep only non-zero sums */
        out++;
      pa++;
      pb++;
    }
    else if(pa->wnum < pb->wnum) {
      (*out)=(*pa);
      out++;
      pa++;
    }
    else {
      (*out)=(*pb);
      out++;
      pb++;
    }
  }
  for(;pb->wnum;pb++) {
    (*out)=(*pb);
    out++;
  }
  for(;pa->wnum;pa++) {
    (*out)=(*pa);
    out++;
  }
  out->wnum=0;

  result=create_svector(merged,"",1.0);
  free(merged);

  return(result);
}
|
338
|
+
|
339
|
+
SVECTOR* add_list_ss(SVECTOR *a)
/* Computes the linear combination of the SVECTOR list a, each vector
   weighted by its own factor, and returns it as a single newly
   allocated SVECTOR with factor 1.0. For a NULL list an empty vector
   is returned. */
{
  SVECTOR *total,*previous,*term,*node;
  WORD terminator[2];

  if(!a) {
    terminator[0].wnum=0;
    return(create_svector(terminator,"",1.0));
  }

  total=smult_s(a,a->factor);
  node=a->next;
  while(node) {
    term=smult_s(node,node->factor);
    previous=total;
    total=add_ss(previous,term);
    /* both operands were temporaries */
    free_svector(previous);
    free_svector(term);
    node=node->next;
  }
  total->factor=1.0;
  return(total);
}
|
363
|
+
|
364
|
+
void append_svector_list(SVECTOR *a, SVECTOR *b)
/* Appends the SVECTOR list b to the end of the SVECTOR list a.
   a is dereferenced without a check, so it must not be NULL. */
{
  SVECTOR *tail=a;

  while(tail->next) {       /* find end of first vector list */
    tail=tail->next;
  }
  tail->next=b;             /* append the two vector lists */
}
|
372
|
+
|
373
|
+
SVECTOR* smult_s(SVECTOR *a, double factor)
/* Scales the sparse vector a by factor and returns the result as a
   newly allocated SVECTOR that inherits a's userdefined string and
   a's factor field. Entries whose scaled weight is exactly 0 are
   dropped. Only the first SVECTOR of a list is used. */
{
  SVECTOR *result;
  WORD *scaled,*out;
  WORD *in;
  long capacity;

  /* number of entries of a, plus one for the terminator */
  capacity=1;
  for(in=a->words;in->wnum;in++) {
    capacity++;
  }

  scaled=(WORD *)my_malloc(sizeof(WORD)*capacity);
  out=scaled;
  for(in=a->words;in->wnum;in++) {
    (*out)=(*in);
    out->weight*=factor;
    if(out->weight != 0)
      out++;
  }
  out->wnum=0;

  result=create_svector(scaled,a->userdefined,a->factor);
  free(scaled);

  return(result);
}
|
406
|
+
|
407
|
+
int featvec_eq(SVECTOR *a, SVECTOR *b)
/* Tests two sparse vectors for equality. Returns 1 if both vectors
   represent the same values, 0 otherwise. A feature that is present
   in only one vector counts as equal if its weight there is 0.
   Entries remaining in the longer vector after the shorter one is
   exhausted are checked too, so a vector is not reported equal to a
   proper prefix of itself. Only the first SVECTOR of each list is
   compared. */
{
  WORD *pa=a->words;
  WORD *pb=b->words;

  while(pa->wnum && pb->wnum) {
    if(pa->wnum > pb->wnum) {
      /* feature only in b */
      if((pb->weight) != 0)
        return(0);
      pb++;
    }
    else if(pa->wnum < pb->wnum) {
      /* feature only in a */
      if((pa->weight) != 0)
        return(0);
      pa++;
    }
    else {
      if((pa->weight) != (pb->weight))
        return(0);
      pa++;
      pb++;
    }
  }

  /* at most one of the two lists still has entries; all of them must
     carry weight 0 for the vectors to be equal */
  for(;pa->wnum;pa++) {
    if((pa->weight) != 0)
      return(0);
  }
  for(;pb->wnum;pb++) {
    if((pb->weight) != 0)
      return(0);
  }
  return(1);
}
|
433
|
+
|
434
|
+
double model_length_s(MODEL *model, KERNEL_PARM *kernel_parm)
/* Computes the length of the weight vector: the square root of the
   sum of alpha[i]*alpha[j]*kernel(supvec[i],supvec[j]) over all pairs
   of support vectors (slots 1..sv_num-1). */
{
  long row,col;
  double total=0;
  double alpha_row;
  DOC *sv_row;

  row=1;
  while(row<model->sv_num) {
    alpha_row=model->alpha[row];
    sv_row=model->supvec[row];
    col=1;
    while(col<model->sv_num) {
      total+=alpha_row*model->alpha[col]
        *kernel(kernel_parm,sv_row,model->supvec[col]);
      col++;
    }
    row++;
  }
  return(sqrt(total));
}
|
451
|
+
|
452
|
+
void clear_vector_n(double *vec, long int n)
/* Sets vec[0] through vec[n] (inclusive, i.e. n+1 elements) to 0. */
{
  long idx=0;

  while(idx<=n) {
    vec[idx]=0;
    idx++;
  }
}
|
457
|
+
|
458
|
+
void add_vector_ns(double *vec_n, SVECTOR *vec_s, double faktor)
/* Adds faktor times the sparse vector vec_s to the dense vector
   vec_n, which is indexed directly by feature number. vec_n must be
   large enough for the highest feature number in vec_s. */
{
  WORD *entry;

  for(entry=vec_s->words;entry->wnum;entry++) {
    vec_n[entry->wnum]+=(faktor*entry->weight);
  }
}
|
467
|
+
|
468
|
+
double sprod_ns(double *vec_n, SVECTOR *vec_s)
/* Computes the inner product of the dense vector vec_n, indexed by
   feature number, and the sparse vector vec_s. vec_n must be large
   enough for the highest feature number in vec_s. */
{
  WORD *entry;
  double dot=0;

  for(entry=vec_s->words;entry->wnum;entry++) {
    dot+=(vec_n[entry->wnum]*entry->weight);
  }
  return(dot);
}
|
479
|
+
|
480
|
+
void add_weight_vector_to_linear_model(MODEL *model)
/* Computes the explicit weight vector for the linear case and stores
   it in model->lin_weights (totwords+1 entries, indexed by feature
   number): the sum over all support vectors of
   alpha * factor * vector. */
{
  SVECTOR *part;
  long sv;

  model->lin_weights=(double *)my_malloc(sizeof(double)*(model->totwords+1));
  clear_vector_n(model->lin_weights,model->totwords);

  sv=1;
  while(sv<model->sv_num) {
    part=(model->supvec[sv])->fvec;
    while(part) {
      add_vector_ns(model->lin_weights,part,part->factor*model->alpha[sv]);
      part=part->next;
    }
    sv++;
  }
}
|
493
|
+
|
494
|
+
|
495
|
+
DOC *create_example(long docnum, long queryid, long slackid,
                    double costfactor, SVECTOR *fvec)
/* Allocates a DOC and fills it with the given values. The feature
   vector list fvec is stored by pointer, not copied. Release the
   result with free_example(). */
{
  DOC *doc=(DOC *)my_malloc(sizeof(DOC));

  doc->fvec=fvec;
  doc->costfactor=costfactor;
  doc->slackid=slackid;
  doc->queryid=queryid;
  doc->docnum=docnum;
  return(doc);
}
|
507
|
+
|
508
|
+
void free_example(DOC *example, long deep)
/* Releases a DOC. With deep != 0 its feature vector list is released
   as well; with deep == 0 the list is left untouched. A NULL example
   is accepted. */
{
  if(!example) {
    return;
  }
  if(deep && example->fvec) {
    free_svector(example->fvec);
  }
  free(example);
}
|
518
|
+
|
519
|
+
void write_model(char *modelfile, MODEL *model)
/* Writes the model to the text file modelfile: a version line, the
   kernel parameters, the highest feature index, the number of
   training documents, the number of support vector lines plus 1, the
   threshold b, and then one line per SVECTOR of every support vector
   (alpha*factor followed by the feature:value pairs and the
   userdefined string). Terminates the program if the file cannot be
   opened. */
{
  FILE *out;
  SVECTOR *part;
  long sv,w,sv_lines;

  if(verbosity>=1) {
    printf("Writing model file..."); fflush(stdout);
  }
  out=fopen(modelfile,"w");
  if(out == NULL) {
    perror(modelfile);
    exit(1);
  }

  /* header */
  fprintf(out,"SVM-light Version %s\n",VERSION);
  fprintf(out,"%ld # kernel type\n",model->kernel_parm.kernel_type);
  fprintf(out,"%ld # kernel parameter -d \n",model->kernel_parm.poly_degree);
  fprintf(out,"%.8g # kernel parameter -g \n",model->kernel_parm.rbf_gamma);
  fprintf(out,"%.8g # kernel parameter -s \n",model->kernel_parm.coef_lin);
  fprintf(out,"%.8g # kernel parameter -r \n",model->kernel_parm.coef_const);
  fprintf(out,"%s# kernel parameter -u \n",model->kernel_parm.custom);
  fprintf(out,"%ld # highest feature index \n",model->totwords);
  fprintf(out,"%ld # number of training documents \n",model->totdoc);

  /* every SVECTOR of a support vector becomes a line of its own, so
     the written count is the number of SVECTORs plus 1 */
  sv_lines=1;
  for(sv=1;sv<model->sv_num;sv++) {
    part=model->supvec[sv]->fvec;
    while(part) {
      sv_lines++;
      part=part->next;
    }
  }
  fprintf(out,"%ld # number of support vectors plus 1 \n",sv_lines);
  fprintf(out,"%.8g # threshold b, each following line is a SV (starting with alpha*y)\n",model->b);

  /* support vectors */
  for(sv=1;sv<model->sv_num;sv++) {
    part=model->supvec[sv]->fvec;
    while(part) {
      fprintf(out,"%.32g ",model->alpha[sv]*part->factor);
      w=0;
      while((part->words[w]).wnum) {
        fprintf(out,"%ld:%.8g ",
                (long)(part->words[w]).wnum,
                (double)(part->words[w]).weight);
        w++;
      }
      fprintf(out,"#%s\n",part->userdefined);
      /* NOTE: this could be made more efficient by summing the
         alpha's of identical vectors before writing them to the
         file. */
      part=part->next;
    }
  }
  fclose(out);
  if(verbosity>=1) {
    printf("done\n");
  }
}
|
572
|
+
|
573
|
+
|
574
|
+
MODEL *read_model(char *modelfile)
|
575
|
+
{
|
576
|
+
FILE *modelfl;
|
577
|
+
long i,queryid,slackid;
|
578
|
+
double costfactor;
|
579
|
+
long max_sv,max_words,ll,wpos;
|
580
|
+
char *line,*comment;
|
581
|
+
WORD *words;
|
582
|
+
char version_buffer[100];
|
583
|
+
MODEL *model;
|
584
|
+
|
585
|
+
if(verbosity>=1) {
|
586
|
+
printf("Reading model..."); fflush(stdout);
|
587
|
+
}
|
588
|
+
|
589
|
+
nol_ll(modelfile,&max_sv,&max_words,&ll); /* scan size of model file */
|
590
|
+
max_words+=2;
|
591
|
+
ll+=2;
|
592
|
+
|
593
|
+
words = (WORD *)my_malloc(sizeof(WORD)*(max_words+10));
|
594
|
+
line = (char *)my_malloc(sizeof(char)*ll);
|
595
|
+
model = (MODEL *)my_malloc(sizeof(MODEL));
|
596
|
+
|
597
|
+
if ((modelfl = fopen (modelfile, "r")) == NULL)
|
598
|
+
{ perror (modelfile); exit (1); }
|
599
|
+
|
600
|
+
fscanf(modelfl,"SVM-light Version %s\n",version_buffer);
|
601
|
+
if(strcmp(version_buffer,VERSION)) {
|
602
|
+
perror ("Version of model-file does not match version of svm_classify!");
|
603
|
+
exit (1);
|
604
|
+
}
|
605
|
+
fscanf(modelfl,"%ld%*[^\n]\n", &model->kernel_parm.kernel_type);
|
606
|
+
fscanf(modelfl,"%ld%*[^\n]\n", &model->kernel_parm.poly_degree);
|
607
|
+
fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.rbf_gamma);
|
608
|
+
fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.coef_lin);
|
609
|
+
fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.coef_const);
|
610
|
+
fscanf(modelfl,"%[^#]%*[^\n]\n", model->kernel_parm.custom);
|
611
|
+
|
612
|
+
fscanf(modelfl,"%ld%*[^\n]\n", &model->totwords);
|
613
|
+
fscanf(modelfl,"%ld%*[^\n]\n", &model->totdoc);
|
614
|
+
fscanf(modelfl,"%ld%*[^\n]\n", &model->sv_num);
|
615
|
+
fscanf(modelfl,"%lf%*[^\n]\n", &model->b);
|
616
|
+
|
617
|
+
model->supvec = (DOC **)my_malloc(sizeof(DOC *)*model->sv_num);
|
618
|
+
model->alpha = (double *)my_malloc(sizeof(double)*model->sv_num);
|
619
|
+
model->index=NULL;
|
620
|
+
model->lin_weights=NULL;
|
621
|
+
|
622
|
+
for(i=1;i<model->sv_num;i++) {
|
623
|
+
fgets(line,(int)ll,modelfl);
|
624
|
+
if(!parse_document(line,words,&(model->alpha[i]),&queryid,&slackid,
|
625
|
+
&costfactor,&wpos,max_words,&comment)) {
|
626
|
+
printf("\nParsing error while reading model file in SV %ld!\n%s",
|
627
|
+
i,line);
|
628
|
+
exit(1);
|
629
|
+
}
|
630
|
+
model->supvec[i] = create_example(-1,
|
631
|
+
0,0,
|
632
|
+
0.0,
|
633
|
+
create_svector(words,comment,1.0));
|
634
|
+
}
|
635
|
+
fclose(modelfl);
|
636
|
+
free(line);
|
637
|
+
free(words);
|
638
|
+
if(verbosity>=1) {
|
639
|
+
fprintf(stdout, "OK. (%d support vectors read)\n",(int)(model->sv_num-1));
|
640
|
+
}
|
641
|
+
return(model);
|
642
|
+
}
|
643
|
+
|
644
|
+
MODEL *copy_model(MODEL *model)
/* Returns a newly allocated copy of model. All plain fields are
   copied by struct assignment; the alpha array, the support vectors
   (with their feature vector lists) and, if present, the linear
   weight vector are duplicated. The index is not copied and is set to
   NULL. The slackid of every copied support vector is set to 0. */
{
  MODEL *dup;
  DOC *sv;
  long k;

  dup=(MODEL *)my_malloc(sizeof(MODEL));
  (*dup)=(*model);
  dup->index = NULL;                       /* index is not copied */

  dup->supvec = (DOC **)my_malloc(sizeof(DOC *)*model->sv_num);
  dup->alpha = (double *)my_malloc(sizeof(double)*model->sv_num);
  dup->supvec[0] = NULL;
  dup->alpha[0] = 0;
  k=1;
  while(k<model->sv_num) {
    sv=model->supvec[k];
    dup->alpha[k]=model->alpha[k];
    dup->supvec[k]=create_example(sv->docnum,
                                  sv->queryid,0,
                                  sv->costfactor,
                                  copy_svector(sv->fvec));
    k++;
  }

  if(model->lin_weights) {
    dup->lin_weights = (double *)my_malloc(sizeof(double)*(model->totwords+1));
    k=0;
    while(k<model->totwords+1) {
      dup->lin_weights[k]=model->lin_weights[k];
      k++;
    }
  }
  return(dup);
}
|
670
|
+
|
671
|
+
void free_model(MODEL *model, int deep)
/* Releases a MODEL together with its supvec, alpha, index and
   lin_weights arrays. With deep != 0 the support vector examples in
   slots 1..sv_num-1 (and their feature vectors) are released as
   well. */
{
  long sv;

  if(model->supvec) {
    if(deep) {
      sv=1;
      while(sv<model->sv_num) {
        free_example(model->supvec[sv],1);
        sv++;
      }
    }
    free(model->supvec);
  }
  if(model->alpha) {
    free(model->alpha);
  }
  if(model->index) {
    free(model->index);
  }
  if(model->lin_weights) {
    free(model->lin_weights);
  }
  free(model);
}
|
688
|
+
|
689
|
+
|
690
|
+
void read_documents(char *docfile, DOC ***docs, double **label,
                    long int *totwords, long int *totdoc)
/* Reads all examples of docfile into memory.
   docfile   name of the example file
   docs      out: newly allocated array of examples
   label     out: newly allocated array of target values
   totwords  out: highest feature number seen
   totdoc    out: number of examples read
   Lines starting with '#' are skipped. The program is terminated if
   the file cannot be opened, a line cannot be parsed, or the highest
   feature number exceeds MAXFEATNUM. */
{
  FILE *in;
  WORD *features;
  char *linebuf,*comment;
  long max_docs,max_words_doc,max_line_len;
  long queryid,slackid,wpos;
  long dnum=0;
  long dpos=0,dneg=0,dunlab=0;
  double doc_label,costfactor;

  if(verbosity>=1) {
    printf("Scanning examples..."); fflush(stdout);
  }
  /* scan size of input file */
  nol_ll(docfile,&max_docs,&max_words_doc,&max_line_len);
  max_words_doc+=2;
  max_line_len+=2;
  max_docs+=2;
  if(verbosity>=1) {
    printf("done\n"); fflush(stdout);
  }

  (*docs) = (DOC **)my_malloc(sizeof(DOC *)*max_docs);     /* feature vectors */
  (*label) = (double *)my_malloc(sizeof(double)*max_docs); /* target values */
  linebuf = (char *)my_malloc(sizeof(char)*max_line_len);

  in=fopen(docfile,"r");
  if(in == NULL) {
    perror(docfile);
    exit(1);
  }

  features = (WORD *)my_malloc(sizeof(WORD)*(max_words_doc+10));
  if(verbosity>=1) {
    printf("Reading examples into memory..."); fflush(stdout);
  }
  dnum=0;
  (*totwords)=0;
  for(;;) {
    if(feof(in)) break;
    if(!fgets(linebuf,(int)max_line_len,in)) break;
    if(linebuf[0] == '#') continue;          /* comment line */

    if(!parse_document(linebuf,features,&doc_label,&queryid,&slackid,
                       &costfactor,&wpos,max_words_doc,&comment)) {
      printf("\nParsing error in line %ld!\n%s",dnum,linebuf);
      exit(1);
    }
    (*label)[dnum]=doc_label;

    if(doc_label > 0) dpos++;
    if(doc_label < 0) dneg++;
    if(doc_label == 0) dunlab++;

    /* features[wpos-2] is taken as the entry with the highest
       feature number of this line */
    if((wpos>1) && ((features[wpos-2]).wnum>(*totwords))) {
      (*totwords)=(features[wpos-2]).wnum;
    }
    if((*totwords) > MAXFEATNUM) {
      printf("\nMaximum feature number exceeds limit defined in MAXFEATNUM!\n");
      printf("LINE: %s\n",linebuf);
      exit(1);
    }

    (*docs)[dnum] = create_example(dnum,queryid,slackid,costfactor,
                                   create_svector(features,comment,1.0));
    dnum++;
    if((verbosity>=1) && ((dnum % 100) == 0)) {
      printf("%ld..",dnum); fflush(stdout);
    }
  }

  fclose(in);
  free(linebuf);
  free(features);
  if(verbosity>=1) {
    fprintf(stdout, "OK. (%ld examples read)\n", dnum);
  }
  (*totdoc)=dnum;
}
|
762
|
+
|
763
|
+
int parse_document(char *line, WORD *words, double *label,
		   long *queryid, long *slackid, double *costfactor,
		   long int *numwords, long int max_words_doc,
		   char **comment)
     /* Parses one example line: a leading target value followed by any
	mix of "qid:<n>", "sid:<n>", "cost:<x>" and "<feature>:<value>"
	tokens, optionally ended by a '#' comment.

	LINE is modified in place: the first '#' and the newline are
	overwritten with 0. COMMENT is set to the text after the '#', or
	to the empty string at the end of LINE if there is no comment.
	QUERYID and SLACKID default to 0 and COSTFACTOR to 1 when the
	corresponding token is absent.

	At most MAX_WORDS_DOC features are stored in WORDS, followed by a
	terminating entry with wnum==0, so WORDS needs room for
	MAX_WORDS_DOC+1 entries. NUMWORDS receives the number of entries
	written including that terminator.

	Returns 1 on success and 0 if the line holds no token or the
	first token is not a number. Malformed tokens, non-positive
	feature or slack ids and features that are not strictly
	increasing print a message and terminate the program. */
{
  long wpos,pos;
  long wnum;
  double weight;
  int numread;
  char featurepair[1000],junk[1000]; /* every %s below is capped at 999 */

  (*queryid)=0;
  (*slackid)=0;
  (*costfactor)=1;

  /* cut off the comment and strip the newline */
  (*comment)=NULL;
  for(pos=0;line[pos];pos++) {
    if((line[pos] == '#') && (!(*comment))) {
      line[pos]=0;
      (*comment)=&(line[pos+1]);
    }
    if(line[pos] == '\n') {
      line[pos]=0;
    }
  }
  if(!(*comment)) (*comment)=&(line[pos]);

  wpos=0;
  /* check, that line starts with target value or zero, but not with
     feature pair */
  if(sscanf(line,"%999s",featurepair) == EOF) return(0);
  pos=0;
  while((featurepair[pos] != ':') && featurepair[pos]) pos++;
  if(featurepair[pos] == ':') {
    /* not an errno failure, so report with fprintf rather than perror */
    fprintf(stderr,"Line must start with label or 0!!!\n");
    printf("LINE: %s\n",line);
    exit (1);
  }
  /* read the target value; a token that is not a number yields 0
     conversions (not EOF) and must not leave *label unset */
  if(sscanf(line,"%lf",label) != 1) return(0);

  /* skip leading blanks and the target token itself */
  pos=0;
  while(space_or_null((int)line[pos])) pos++;
  while((!space_or_null((int)line[pos])) && line[pos]) pos++;

  while(((numread=sscanf(line+pos,"%999s",featurepair)) != EOF) &&
	(numread > 0) &&
	(wpos<max_words_doc)) {
    /* advance pos past the token that was just copied to featurepair */
    while(space_or_null((int)line[pos])) pos++;
    while((!space_or_null((int)line[pos])) && line[pos]) pos++;

    /* a trailing %s that matches means there is junk after the number,
       so each form is accepted only on the exact conversion count */
    if(sscanf(featurepair,"qid:%ld%999s",&wnum,junk)==1) {
      /* it is the query id */
      (*queryid)=(long)wnum;
    }
    else if(sscanf(featurepair,"sid:%ld%999s",&wnum,junk)==1) {
      /* it is the slack id */
      if(wnum > 0)
	(*slackid)=(long)wnum;
      else {
	fprintf(stderr,"Slack-id must be greater or equal to 1!!!\n");
	printf("LINE: %s\n",line);
	exit (1);
      }
    }
    else if(sscanf(featurepair,"cost:%lf%999s",&weight,junk)==1) {
      /* it is the example-dependent cost factor */
      (*costfactor)=(double)weight;
    }
    else if(sscanf(featurepair,"%ld:%lf%999s",&wnum,&weight,junk)==2) {
      /* it is a regular feature */
      if(wnum<=0) {
	fprintf(stderr,"Feature numbers must be larger or equal to 1!!!\n");
	printf("LINE: %s\n",line);
	exit (1);
      }
      if((wpos>0) && ((words[wpos-1]).wnum >= wnum)) {
	fprintf(stderr,"Features must be in increasing order!!!\n");
	printf("LINE: %s\n",line);
	exit (1);
      }
      (words[wpos]).wnum=wnum;
      (words[wpos]).weight=(FVAL)weight;
      wpos++;
    }
    else {
      fprintf(stderr,"Cannot parse feature/value pair!!!\n");
      printf("'%s' in LINE: %s\n",featurepair,line);
      exit (1);
    }
  }
  (words[wpos]).wnum=0;   /* terminator entry */
  (*numwords)=wpos+1;
  return(1);
}
|
859
|
+
|
860
|
+
double *read_alphas(char *alphafile,long totdoc)
|
861
|
+
/* reads the alpha vector from a file as written by the
|
862
|
+
write_alphas function */
|
863
|
+
{
|
864
|
+
FILE *fl;
|
865
|
+
double *alpha;
|
866
|
+
long dnum;
|
867
|
+
|
868
|
+
if ((fl = fopen (alphafile, "r")) == NULL)
|
869
|
+
{ perror (alphafile); exit (1); }
|
870
|
+
|
871
|
+
alpha = (double *)my_malloc(sizeof(double)*totdoc);
|
872
|
+
if(verbosity>=1) {
|
873
|
+
printf("Reading alphas..."); fflush(stdout);
|
874
|
+
}
|
875
|
+
dnum=0;
|
876
|
+
while((!feof(fl)) && fscanf(fl,"%lf\n",&alpha[dnum]) && (dnum<totdoc)) {
|
877
|
+
dnum++;
|
878
|
+
}
|
879
|
+
if(dnum != totdoc)
|
880
|
+
{ perror ("\nNot enough values in alpha file!"); exit (1); }
|
881
|
+
fclose(fl);
|
882
|
+
|
883
|
+
if(verbosity>=1) {
|
884
|
+
printf("done\n"); fflush(stdout);
|
885
|
+
}
|
886
|
+
|
887
|
+
return(alpha);
|
888
|
+
}
|
889
|
+
|
890
|
+
void nol_ll(char *file, long int *nol, long int *wol, long int *ll)
     /* Scans FILE once and reports:
	  NOL - number of newline characters plus one,
	  WOL - largest number of whitespace characters (as judged by
		space_or_null, newline included) on any single line,
	  LL  - length of the longest line, newline included.
	A final line that is not newline-terminated is counted as if
	the newline were present, so the maxima also cover it.
	Terminates the program if FILE cannot be opened. */
{
  FILE *fl;
  int ic;
  long current_length,current_wol;

  if ((fl = fopen (file, "r")) == NULL)
  { perror (file); exit (1); }
  current_length=0;
  current_wol=0;
  (*ll)=0;
  (*nol)=1;
  (*wol)=0;
  while((ic=getc(fl)) != EOF) {
    current_length++;
    if(space_or_null(ic)) {
      current_wol++;
    }
    if(ic == '\n') {
      (*nol)++;
      if(current_length>(*ll)) {
	(*ll)=current_length;
      }
      if(current_wol>(*wol)) {
	(*wol)=current_wol;
      }
      current_length=0;
      current_wol=0;
    }
  }
  /* The last line may end at EOF without a newline. Fold it into the
     maxima with one extra character standing in for the newline, so it
     is measured like every other line. */
  if(current_length>0) {
    current_length++;
    current_wol++;
    if(current_length>(*ll)) {
      (*ll)=current_length;
    }
    if(current_wol>(*wol)) {
      (*wol)=current_wol;
    }
  }
  fclose(fl);
}
|
926
|
+
|
927
|
+
long minl(long int a, long int b)
     /* Returns the smaller of A and B (B when they are equal). */
{
  return (a < b) ? a : b;
}
|
934
|
+
|
935
|
+
long maxl(long int a, long int b)
     /* Returns the larger of A and B (B when they are equal). */
{
  return (a > b) ? a : b;
}
|
942
|
+
|
943
|
+
long get_runtime(void)
     /* Returns the value of clock() converted to hundredths of a
	second, truncated to a long. */
{
  double ticks = (double)clock();

  return (long)(ticks*100.0/(double)CLOCKS_PER_SEC);
}
|
949
|
+
|
950
|
+
|
951
|
+
# ifdef _MSC_VER

/* Compiled only when _MSC_VER is defined: provides isnan() by
   forwarding to _isnan(). */
int isnan(double a)
{
  int result = _isnan(a);

  return result;
}

# endif
|
959
|
+
|
960
|
+
int space_or_null(int c)
     /* Returns 1 for the terminating 0 character; otherwise returns
	the result of isspace() on C taken as an unsigned char
	(nonzero for whitespace, 0 for anything else). */
{
  return (c == 0) ? 1 : isspace((unsigned char)c);
}
|
965
|
+
|
966
|
+
void *my_malloc(size_t size)
     /* malloc() wrapper that never returns NULL: a request for 0
	bytes is turned into a request for 1 byte, and an allocation
	failure prints a message and terminates the program. */
{
  size_t request = (size == 0) ? 1 : size; /* for AIX compatibility */
  void *ptr = (void *)malloc(request);

  if(ptr == NULL) {
    perror ("Out of memory!\n");
    exit (1);
  }
  return ptr;
}
|
977
|
+
|
978
|
+
void copyright_notice(void)
     /* Prints the author and license notice to stdout. */
{
  /* adjacent string literals are joined by the compiler, so this
     single call emits the same text as one call per line would */
  printf("\nCopyright: Thorsten Joachims, thorsten@joachims.org\n\n"
	 "This software is available for non-commercial use only. It must not\n"
	 "be modified and distributed without prior permission of the author.\n"
	 "The author is not responsible for implications from the use of this\n"
	 "software.\n\n");
}
|