svmlightcli 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,197 @@
1
+ /***********************************************************************/
2
+ /* */
3
+ /* svm_classify.c */
4
+ /* */
5
+ /* Classification module of Support Vector Machine. */
6
+ /* */
7
+ /* Author: Thorsten Joachims */
8
+ /* Date: 02.07.02 */
9
+ /* */
10
+ /* Copyright (c) 2002 Thorsten Joachims - All rights reserved */
11
+ /* */
12
+ /* This software is available for non-commercial use only. It must */
13
+ /* not be modified and distributed without prior permission of the */
14
+ /* author. The author is not responsible for implications from the */
15
+ /* use of this software. */
16
+ /* */
17
+ /************************************************************************/
18
+
19
+ # include "svm_common.h"
20
+
21
/* File names for one svm_classify run. read_input_parameters() copies */
/* the command-line arguments (or the built-in defaults) into them.    */
char docfile[200];          /* file with the examples to classify */
char modelfile[200];        /* file with the learned model        */
char predictionsfile[200];  /* file the predictions are written to */

void read_input_parameters(int, char **, char *, char *, char *,
                           long *, long *);
void print_help(void);
28
+
29
+
30
+ int main (int argc, char* argv[])
31
+ {
32
+ DOC *doc; /* test example */
33
+ WORD *words;
34
+ long max_docs,max_words_doc,lld;
35
+ long totdoc=0,queryid,slackid;
36
+ long correct=0,incorrect=0,no_accuracy=0;
37
+ long res_a=0,res_b=0,res_c=0,res_d=0,wnum,pred_format;
38
+ long j;
39
+ double t1,runtime=0;
40
+ double dist,doc_label,costfactor;
41
+ char *line,*comment;
42
+ FILE *predfl,*docfl;
43
+ MODEL *model;
44
+
45
+ read_input_parameters(argc,argv,docfile,modelfile,predictionsfile,
46
+ &verbosity,&pred_format);
47
+
48
+ nol_ll(docfile,&max_docs,&max_words_doc,&lld); /* scan size of input file */
49
+ max_words_doc+=2;
50
+ lld+=2;
51
+
52
+ line = (char *)my_malloc(sizeof(char)*lld);
53
+ words = (WORD *)my_malloc(sizeof(WORD)*(max_words_doc+10));
54
+
55
+ model=read_model(modelfile);
56
+
57
+ if(model->kernel_parm.kernel_type == 0) { /* linear kernel */
58
+ /* compute weight vector */
59
+ add_weight_vector_to_linear_model(model);
60
+ }
61
+
62
+ if(verbosity>=2) {
63
+ printf("Classifying test examples.."); fflush(stdout);
64
+ }
65
+
66
+ if ((docfl = fopen (docfile, "r")) == NULL)
67
+ { perror (docfile); exit (1); }
68
+ if ((predfl = fopen (predictionsfile, "w")) == NULL)
69
+ { perror (predictionsfile); exit (1); }
70
+
71
+ while((!feof(docfl)) && fgets(line,(int)lld,docfl)) {
72
+ if(line[0] == '#') continue; /* line contains comments */
73
+ parse_document(line,words,&doc_label,&queryid,&slackid,&costfactor,&wnum,
74
+ max_words_doc,&comment);
75
+ totdoc++;
76
+ if(model->kernel_parm.kernel_type == 0) { /* linear kernel */
77
+ for(j=0;(words[j]).wnum != 0;j++) { /* Check if feature numbers */
78
+ if((words[j]).wnum>model->totwords) /* are not larger than in */
79
+ (words[j]).wnum=0; /* model. Remove feature if */
80
+ } /* necessary. */
81
+ doc = create_example(-1,0,0,0.0,create_svector(words,comment,1.0));
82
+ t1=get_runtime();
83
+ dist=classify_example_linear(model,doc);
84
+ runtime+=(get_runtime()-t1);
85
+ free_example(doc,1);
86
+ }
87
+ else { /* non-linear kernel */
88
+ doc = create_example(-1,0,0,0.0,create_svector(words,comment,1.0));
89
+ t1=get_runtime();
90
+ dist=classify_example(model,doc);
91
+ runtime+=(get_runtime()-t1);
92
+ free_example(doc,1);
93
+ }
94
+ if(dist>0) {
95
+ if(pred_format==0) { /* old weired output format */
96
+ fprintf(predfl,"%.8g:+1 %.8g:-1\n",dist,-dist);
97
+ }
98
+ if(doc_label>0) correct++; else incorrect++;
99
+ if(doc_label>0) res_a++; else res_b++;
100
+ }
101
+ else {
102
+ if(pred_format==0) { /* old weired output format */
103
+ fprintf(predfl,"%.8g:-1 %.8g:+1\n",-dist,dist);
104
+ }
105
+ if(doc_label<0) correct++; else incorrect++;
106
+ if(doc_label>0) res_c++; else res_d++;
107
+ }
108
+ if(pred_format==1) { /* output the value of decision function */
109
+ fprintf(predfl,"%.8g\n",dist);
110
+ }
111
+ if((int)(0.01+(doc_label*doc_label)) != 1)
112
+ { no_accuracy=1; } /* test data is not binary labeled */
113
+ if(verbosity>=2) {
114
+ if(totdoc % 100 == 0) {
115
+ printf("%ld..",totdoc); fflush(stdout);
116
+ }
117
+ }
118
+ }
119
+ fclose(predfl);
120
+ fclose(docfl);
121
+ free(line);
122
+ free(words);
123
+ free_model(model,1);
124
+
125
+ if(verbosity>=2) {
126
+ printf("done\n");
127
+
128
+ /* Note by Gary Boone Date: 29 April 2000 */
129
+ /* o Timing is inaccurate. The timer has 0.01 second resolution. */
130
+ /* Because classification of a single vector takes less than */
131
+ /* 0.01 secs, the timer was underflowing. */
132
+ printf("Runtime (without IO) in cpu-seconds: %.2f\n",
133
+ (float)(runtime/100.0));
134
+
135
+ }
136
+ if((!no_accuracy) && (verbosity>=1)) {
137
+ printf("Accuracy on test set: %.2f%% (%ld correct, %ld incorrect, %ld total)\n",(float)(correct)*100.0/totdoc,correct,incorrect,totdoc);
138
+ printf("Precision/recall on test set: %.2f%%/%.2f%%\n",(float)(res_a)*100.0/(res_a+res_b),(float)(res_a)*100.0/(res_a+res_c));
139
+ }
140
+
141
+ return(0);
142
+ }
143
+
144
/*
 * Parses the svm_classify command line.
 *
 *   argc, argv       - as passed to main()
 *   docfile          - out: example file name (first positional argument)
 *   modelfile        - out: model file name (second positional argument)
 *   predictionsfile  - out: output file name (optional third positional
 *                      argument, default "svm_predictions")
 *   verbosity        - out: value of -v (default 2)
 *   pred_format      - out: value of -f (default 1; only 0 or 1 accepted)
 *
 * Any usage error prints a message plus the help text and terminates
 * the program with exit(0).
 *
 * The three file-name buffers are assumed to hold FILENAME_CAPACITY
 * bytes, which matches the char[200] arrays svm_classify.c passes in;
 * longer names are rejected instead of overflowing the buffers.
 */
void read_input_parameters(int argc, char **argv, char *docfile,
                           char *modelfile, char *predictionsfile,
                           long int *verbosity, long int *pred_format)
{
  enum { FILENAME_CAPACITY = 200 };
  long i,k;

  /* set default */
  strcpy (modelfile, "svm_model");
  strcpy (predictionsfile, "svm_predictions");
  (*verbosity)=2;
  (*pred_format)=1;

  for(i=1;(i<argc) && ((argv[i])[0] == '-');i++) {
    switch ((argv[i])[1])
      {
      case 'h': print_help(); exit(0);
      case 'v':
      case 'f':
        /* both options take a value; without this check argv[argc] */
        /* (a null pointer) would be handed to atol()               */
        if((i+1)>=argc) {
          printf("\nOption %s requires a value!\n\n",argv[i]);
          print_help();
          exit(0);
        }
        if((argv[i])[1] == 'v')
          (*verbosity)=atol(argv[i+1]);
        else
          (*pred_format)=atol(argv[i+1]);
        i++;
        break;
      default: printf("\nUnrecognized option %s!\n\n",argv[i]);
               print_help();
               exit(0);
      }
  }
  if((i+1)>=argc) {
    printf("\nNot enough input parameters!\n\n");
    print_help();
    exit(0);
  }
  /* check the (up to three) file names before copying any of them */
  for(k=i;(k<argc) && (k<=(i+2));k++) {
    if(strlen(argv[k]) >= (size_t)FILENAME_CAPACITY) {
      printf("\nFile name too long (at most %d characters): %s\n\n",
             FILENAME_CAPACITY-1,argv[k]);
      print_help();
      exit(0);
    }
  }
  strcpy (docfile, argv[i]);
  strcpy (modelfile, argv[i+1]);
  if((i+2)<argc) {
    strcpy (predictionsfile, argv[i+2]);
  }
  if(((*pred_format) != 0) && ((*pred_format) != 1)) {
    printf("\nOutput format can only take the values 0 or 1!\n\n");
    print_help();
    exit(0);
  }
}
183
+
184
+ void print_help(void)
185
+ {
186
+ printf("\nSVM-light %s: Support Vector Machine, classification module %s\n",VERSION,VERSION_DATE);
187
+ copyright_notice();
188
+ printf(" usage: svm_classify [options] example_file model_file output_file\n\n");
189
+ printf("options: -h -> this help\n");
190
+ printf(" -v [0..3] -> verbosity level (default 2)\n");
191
+ printf(" -f [0,1] -> 0: old output format of V1.0\n");
192
+ printf(" -> 1: output the value of decision function (default)\n\n");
193
+ }
194
+
195
+
196
+
197
+
@@ -0,0 +1,985 @@
1
+ /************************************************************************/
2
+ /* */
3
+ /* svm_common.c */
4
+ /* */
5
+ /* Definitions and functions used in both svm_learn and svm_classify. */
6
+ /* */
7
+ /* Author: Thorsten Joachims */
8
+ /* Date: 02.07.04 */
9
+ /* */
10
+ /* Copyright (c) 2004 Thorsten Joachims - All rights reserved */
11
+ /* */
12
+ /* This software is available for non-commercial use only. It must */
13
+ /* not be modified and distributed without prior permission of the */
14
+ /* author. The author is not responsible for implications from the */
15
+ /* use of this software. */
16
+ /* */
17
+ /************************************************************************/
18
+
19
+ # include "ctype.h"
20
+ # include "svm_common.h"
21
+ # include "kernel.h" /* this contains a user supplied kernel */
22
+
23
long verbosity;               /* verbosity level (0-4) */
long kernel_cache_statistic;  /* incremented on every call of single_kernel() */
25
+
26
+ double classify_example(MODEL *model, DOC *ex)
27
+ /* classifies one example */
28
+ {
29
+ register long i;
30
+ register double dist;
31
+
32
+ if((model->kernel_parm.kernel_type == LINEAR) && (model->lin_weights))
33
+ return(classify_example_linear(model,ex));
34
+
35
+ dist=0;
36
+ for(i=1;i<model->sv_num;i++) {
37
+ dist+=kernel(&model->kernel_parm,model->supvec[i],ex)*model->alpha[i];
38
+ }
39
+ return(dist-model->b);
40
+ }
41
+
42
+ double classify_example_linear(MODEL *model, DOC *ex)
43
+ /* classifies example for linear kernel */
44
+
45
+ /* important: the model must have the linear weight vector computed */
46
+ /* use: add_weight_vector_to_linear_model(&model); */
47
+
48
+
49
+ /* important: the feature numbers in the example to classify must */
50
+ /* not be larger than the weight vector! */
51
+ {
52
+ double sum=0;
53
+ SVECTOR *f;
54
+
55
+ for(f=ex->fvec;f;f=f->next)
56
+ sum+=f->factor*sprod_ns(model->lin_weights,f);
57
+ return(sum-model->b);
58
+ }
59
+
60
+
61
+ double kernel(KERNEL_PARM *kernel_parm, DOC *a, DOC *b)
62
+ /* calculate the kernel function */
63
+ {
64
+ double sum=0;
65
+ SVECTOR *fa,*fb;
66
+
67
+ /* in case the constraints are sums of feature vector as represented
68
+ as a list of SVECTOR's with their coefficient factor in the sum,
69
+ take the kernel between all pairs */
70
+ for(fa=a->fvec;fa;fa=fa->next) {
71
+ for(fb=b->fvec;fb;fb=fb->next) {
72
+ if(fa->kernel_id == fb->kernel_id)
73
+ sum+=fa->factor*fb->factor*single_kernel(kernel_parm,fa,fb);
74
+ }
75
+ }
76
+ return(sum);
77
+ }
78
+
79
+ double single_kernel(KERNEL_PARM *kernel_parm, SVECTOR *a, SVECTOR *b)
80
+ /* calculate the kernel function between two vectors */
81
+ {
82
+ kernel_cache_statistic++;
83
+ switch(kernel_parm->kernel_type) {
84
+ case 0: /* linear */
85
+ return(sprod_ss(a,b));
86
+ case 1: /* polynomial */
87
+ return(pow(kernel_parm->coef_lin*sprod_ss(a,b)+kernel_parm->coef_const,(double)kernel_parm->poly_degree));
88
+ case 2: /* radial basis function */
89
+ return(exp(-kernel_parm->rbf_gamma*(a->twonorm_sq-2*sprod_ss(a,b)+b->twonorm_sq)));
90
+ case 3: /* sigmoid neural net */
91
+ return(tanh(kernel_parm->coef_lin*sprod_ss(a,b)+kernel_parm->coef_const));
92
+ case 4: /* custom-kernel supplied in file kernel.h*/
93
+ return(custom_kernel(kernel_parm,a,b));
94
+ default: printf("Error: Unknown kernel function\n"); exit(1);
95
+ }
96
+ }
97
+
98
+
99
+ SVECTOR *create_svector(WORD *words,char *userdefined,double factor)
100
+ {
101
+ SVECTOR *vec;
102
+ long fnum,i;
103
+
104
+ fnum=0;
105
+ while(words[fnum].wnum) {
106
+ fnum++;
107
+ }
108
+ fnum++;
109
+ vec = (SVECTOR *)my_malloc(sizeof(SVECTOR));
110
+ vec->words = (WORD *)my_malloc(sizeof(WORD)*(fnum));
111
+ for(i=0;i<fnum;i++) {
112
+ vec->words[i]=words[i];
113
+ }
114
+ vec->twonorm_sq=sprod_ss(vec,vec);
115
+
116
+ fnum=0;
117
+ while(userdefined[fnum]) {
118
+ fnum++;
119
+ }
120
+ fnum++;
121
+ vec->userdefined = (char *)my_malloc(sizeof(char)*(fnum));
122
+ for(i=0;i<fnum;i++) {
123
+ vec->userdefined[i]=userdefined[i];
124
+ }
125
+ vec->kernel_id=0;
126
+ vec->next=NULL;
127
+ vec->factor=factor;
128
+ return(vec);
129
+ }
130
+
131
+ SVECTOR *copy_svector(SVECTOR *vec)
132
+ {
133
+ SVECTOR *newvec=NULL;
134
+ if(vec) {
135
+ newvec=create_svector(vec->words,vec->userdefined,vec->factor);
136
+ newvec->next=copy_svector(vec->next);
137
+ }
138
+ return(newvec);
139
+ }
140
+
141
+ void free_svector(SVECTOR *vec)
142
+ {
143
+ if(vec) {
144
+ free(vec->words);
145
+ if(vec->userdefined)
146
+ free(vec->userdefined);
147
+ free_svector(vec->next);
148
+ free(vec);
149
+ }
150
+ }
151
+
152
+ double sprod_ss(SVECTOR *a, SVECTOR *b)
153
+ /* compute the inner product of two sparse vectors */
154
+ {
155
+ register double sum=0;
156
+ register WORD *ai,*bj;
157
+ ai=a->words;
158
+ bj=b->words;
159
+ while (ai->wnum && bj->wnum) {
160
+ if(ai->wnum > bj->wnum) {
161
+ bj++;
162
+ }
163
+ else if (ai->wnum < bj->wnum) {
164
+ ai++;
165
+ }
166
+ else {
167
+ sum+=(ai->weight) * (bj->weight);
168
+ ai++;
169
+ bj++;
170
+ }
171
+ }
172
+ return((double)sum);
173
+ }
174
+
175
+ SVECTOR* sub_ss(SVECTOR *a, SVECTOR *b)
176
+ /* compute the difference a-b of two sparse vectors */
177
+ /* Note: SVECTOR lists are not followed, but only the first
178
+ SVECTOR is used */
179
+ {
180
+ SVECTOR *vec;
181
+ register WORD *sum,*sumi;
182
+ register WORD *ai,*bj;
183
+ long veclength;
184
+
185
+ ai=a->words;
186
+ bj=b->words;
187
+ veclength=0;
188
+ while (ai->wnum && bj->wnum) {
189
+ if(ai->wnum > bj->wnum) {
190
+ veclength++;
191
+ bj++;
192
+ }
193
+ else if (ai->wnum < bj->wnum) {
194
+ veclength++;
195
+ ai++;
196
+ }
197
+ else {
198
+ veclength++;
199
+ ai++;
200
+ bj++;
201
+ }
202
+ }
203
+ while (bj->wnum) {
204
+ veclength++;
205
+ bj++;
206
+ }
207
+ while (ai->wnum) {
208
+ veclength++;
209
+ ai++;
210
+ }
211
+ veclength++;
212
+
213
+ sum=(WORD *)my_malloc(sizeof(WORD)*veclength);
214
+ sumi=sum;
215
+ ai=a->words;
216
+ bj=b->words;
217
+ while (ai->wnum && bj->wnum) {
218
+ if(ai->wnum > bj->wnum) {
219
+ (*sumi)=(*bj);
220
+ sumi->weight*=(-1);
221
+ sumi++;
222
+ bj++;
223
+ }
224
+ else if (ai->wnum < bj->wnum) {
225
+ (*sumi)=(*ai);
226
+ sumi++;
227
+ ai++;
228
+ }
229
+ else {
230
+ (*sumi)=(*ai);
231
+ sumi->weight-=bj->weight;
232
+ if(sumi->weight != 0)
233
+ sumi++;
234
+ ai++;
235
+ bj++;
236
+ }
237
+ }
238
+ while (bj->wnum) {
239
+ (*sumi)=(*bj);
240
+ sumi->weight*=(-1);
241
+ sumi++;
242
+ bj++;
243
+ }
244
+ while (ai->wnum) {
245
+ (*sumi)=(*ai);
246
+ sumi++;
247
+ ai++;
248
+ }
249
+ sumi->wnum=0;
250
+
251
+ vec=create_svector(sum,"",1.0);
252
+ free(sum);
253
+
254
+ return(vec);
255
+ }
256
+
257
+ SVECTOR* add_ss(SVECTOR *a, SVECTOR *b)
258
+ /* compute the sum a+b of two sparse vectors */
259
+ /* Note: SVECTOR lists are not followed, but only the first
260
+ SVECTOR is used */
261
+ {
262
+ SVECTOR *vec;
263
+ register WORD *sum,*sumi;
264
+ register WORD *ai,*bj;
265
+ long veclength;
266
+
267
+ ai=a->words;
268
+ bj=b->words;
269
+ veclength=0;
270
+ while (ai->wnum && bj->wnum) {
271
+ if(ai->wnum > bj->wnum) {
272
+ veclength++;
273
+ bj++;
274
+ }
275
+ else if (ai->wnum < bj->wnum) {
276
+ veclength++;
277
+ ai++;
278
+ }
279
+ else {
280
+ veclength++;
281
+ ai++;
282
+ bj++;
283
+ }
284
+ }
285
+ while (bj->wnum) {
286
+ veclength++;
287
+ bj++;
288
+ }
289
+ while (ai->wnum) {
290
+ veclength++;
291
+ ai++;
292
+ }
293
+ veclength++;
294
+
295
+ /*** is veclength=lengSequence(a)+lengthSequence(b)? ***/
296
+
297
+ sum=(WORD *)my_malloc(sizeof(WORD)*veclength);
298
+ sumi=sum;
299
+ ai=a->words;
300
+ bj=b->words;
301
+ while (ai->wnum && bj->wnum) {
302
+ if(ai->wnum > bj->wnum) {
303
+ (*sumi)=(*bj);
304
+ sumi++;
305
+ bj++;
306
+ }
307
+ else if (ai->wnum < bj->wnum) {
308
+ (*sumi)=(*ai);
309
+ sumi++;
310
+ ai++;
311
+ }
312
+ else {
313
+ (*sumi)=(*ai);
314
+ sumi->weight+=bj->weight;
315
+ if(sumi->weight != 0)
316
+ sumi++;
317
+ ai++;
318
+ bj++;
319
+ }
320
+ }
321
+ while (bj->wnum) {
322
+ (*sumi)=(*bj);
323
+ sumi++;
324
+ bj++;
325
+ }
326
+ while (ai->wnum) {
327
+ (*sumi)=(*ai);
328
+ sumi++;
329
+ ai++;
330
+ }
331
+ sumi->wnum=0;
332
+
333
+ vec=create_svector(sum,"",1.0);
334
+ free(sum);
335
+
336
+ return(vec);
337
+ }
338
+
339
+ SVECTOR* add_list_ss(SVECTOR *a)
340
+ /* computes the linear combination of the SVECTOR list weighted
341
+ by the factor of each SVECTOR */
342
+ {
343
+ SVECTOR *scaled,*oldsum,*sum,*f;
344
+ WORD empty[2];
345
+
346
+ if(a){
347
+ sum=smult_s(a,a->factor);
348
+ for(f=a->next;f;f=f->next) {
349
+ scaled=smult_s(f,f->factor);
350
+ oldsum=sum;
351
+ sum=add_ss(sum,scaled);
352
+ free_svector(oldsum);
353
+ free_svector(scaled);
354
+ }
355
+ sum->factor=1.0;
356
+ }
357
+ else {
358
+ empty[0].wnum=0;
359
+ sum=create_svector(empty,"",1.0);
360
+ }
361
+ return(sum);
362
+ }
363
+
364
+ void append_svector_list(SVECTOR *a, SVECTOR *b)
365
+ /* appends SVECTOR b to the end of SVECTOR a. */
366
+ {
367
+ SVECTOR *f;
368
+
369
+ for(f=a;f->next;f=f->next); /* find end of first vector list */
370
+ f->next=b; /* append the two vector lists */
371
+ }
372
+
373
+ SVECTOR* smult_s(SVECTOR *a, double factor)
374
+ /* scale sparse vector a by factor */
375
+ {
376
+ SVECTOR *vec;
377
+ register WORD *sum,*sumi;
378
+ register WORD *ai;
379
+ long veclength;
380
+
381
+ ai=a->words;
382
+ veclength=0;
383
+ while (ai->wnum) {
384
+ veclength++;
385
+ ai++;
386
+ }
387
+ veclength++;
388
+
389
+ sum=(WORD *)my_malloc(sizeof(WORD)*veclength);
390
+ sumi=sum;
391
+ ai=a->words;
392
+ while (ai->wnum) {
393
+ (*sumi)=(*ai);
394
+ sumi->weight*=factor;
395
+ if(sumi->weight != 0)
396
+ sumi++;
397
+ ai++;
398
+ }
399
+ sumi->wnum=0;
400
+
401
+ vec=create_svector(sum,a->userdefined,a->factor);
402
+ free(sum);
403
+
404
+ return(vec);
405
+ }
406
+
407
+ int featvec_eq(SVECTOR *a, SVECTOR *b)
408
+ /* tests two sparse vectors for equality */
409
+ {
410
+ register WORD *ai,*bj;
411
+ ai=a->words;
412
+ bj=b->words;
413
+ while (ai->wnum && bj->wnum) {
414
+ if(ai->wnum > bj->wnum) {
415
+ if((bj->weight) != 0)
416
+ return(0);
417
+ bj++;
418
+ }
419
+ else if (ai->wnum < bj->wnum) {
420
+ if((ai->weight) != 0)
421
+ return(0);
422
+ ai++;
423
+ }
424
+ else {
425
+ if((ai->weight) != (bj->weight))
426
+ return(0);
427
+ ai++;
428
+ bj++;
429
+ }
430
+ }
431
+ return(1);
432
+ }
433
+
434
+ double model_length_s(MODEL *model, KERNEL_PARM *kernel_parm)
435
+ /* compute length of weight vector */
436
+ {
437
+ register long i,j;
438
+ register double sum=0,alphai;
439
+ register DOC *supveci;
440
+
441
+ for(i=1;i<model->sv_num;i++) {
442
+ alphai=model->alpha[i];
443
+ supveci=model->supvec[i];
444
+ for(j=1;j<model->sv_num;j++) {
445
+ sum+=alphai*model->alpha[j]
446
+ *kernel(kernel_parm,supveci,model->supvec[j]);
447
+ }
448
+ }
449
+ return(sqrt(sum));
450
+ }
451
+
452
/* Sets the entries vec[0] .. vec[n] (n+1 values) to zero. */
void clear_vector_n(double *vec, long int n)
{
  long idx;

  for(idx=n;idx>=0;idx--) {
    vec[idx]=0;
  }
}
457
+
458
+ void add_vector_ns(double *vec_n, SVECTOR *vec_s, double faktor)
459
+ {
460
+ register WORD *ai;
461
+ ai=vec_s->words;
462
+ while (ai->wnum) {
463
+ vec_n[ai->wnum]+=(faktor*ai->weight);
464
+ ai++;
465
+ }
466
+ }
467
+
468
+ double sprod_ns(double *vec_n, SVECTOR *vec_s)
469
+ {
470
+ register double sum=0;
471
+ register WORD *ai;
472
+ ai=vec_s->words;
473
+ while (ai->wnum) {
474
+ sum+=(vec_n[ai->wnum]*ai->weight);
475
+ ai++;
476
+ }
477
+ return(sum);
478
+ }
479
+
480
+ void add_weight_vector_to_linear_model(MODEL *model)
481
+ /* compute weight vector in linear case and add to model */
482
+ {
483
+ long i;
484
+ SVECTOR *f;
485
+
486
+ model->lin_weights=(double *)my_malloc(sizeof(double)*(model->totwords+1));
487
+ clear_vector_n(model->lin_weights,model->totwords);
488
+ for(i=1;i<model->sv_num;i++) {
489
+ for(f=(model->supvec[i])->fvec;f;f=f->next)
490
+ add_vector_ns(model->lin_weights,f,f->factor*model->alpha[i]);
491
+ }
492
+ }
493
+
494
+
495
+ DOC *create_example(long docnum, long queryid, long slackid,
496
+ double costfactor, SVECTOR *fvec)
497
+ {
498
+ DOC *example;
499
+ example = (DOC *)my_malloc(sizeof(DOC));
500
+ example->docnum=docnum;
501
+ example->queryid=queryid;
502
+ example->slackid=slackid;
503
+ example->costfactor=costfactor;
504
+ example->fvec=fvec;
505
+ return(example);
506
+ }
507
+
508
+ void free_example(DOC *example, long deep)
509
+ {
510
+ if(example) {
511
+ if(deep) {
512
+ if(example->fvec)
513
+ free_svector(example->fvec);
514
+ }
515
+ free(example);
516
+ }
517
+ }
518
+
519
+ void write_model(char *modelfile, MODEL *model)
520
+ {
521
+ FILE *modelfl;
522
+ long j,i,sv_num;
523
+ SVECTOR *v;
524
+
525
+ if(verbosity>=1) {
526
+ printf("Writing model file..."); fflush(stdout);
527
+ }
528
+ if ((modelfl = fopen (modelfile, "w")) == NULL)
529
+ { perror (modelfile); exit (1); }
530
+ fprintf(modelfl,"SVM-light Version %s\n",VERSION);
531
+ fprintf(modelfl,"%ld # kernel type\n",
532
+ model->kernel_parm.kernel_type);
533
+ fprintf(modelfl,"%ld # kernel parameter -d \n",
534
+ model->kernel_parm.poly_degree);
535
+ fprintf(modelfl,"%.8g # kernel parameter -g \n",
536
+ model->kernel_parm.rbf_gamma);
537
+ fprintf(modelfl,"%.8g # kernel parameter -s \n",
538
+ model->kernel_parm.coef_lin);
539
+ fprintf(modelfl,"%.8g # kernel parameter -r \n",
540
+ model->kernel_parm.coef_const);
541
+ fprintf(modelfl,"%s# kernel parameter -u \n",model->kernel_parm.custom);
542
+ fprintf(modelfl,"%ld # highest feature index \n",model->totwords);
543
+ fprintf(modelfl,"%ld # number of training documents \n",model->totdoc);
544
+
545
+ sv_num=1;
546
+ for(i=1;i<model->sv_num;i++) {
547
+ for(v=model->supvec[i]->fvec;v;v=v->next)
548
+ sv_num++;
549
+ }
550
+ fprintf(modelfl,"%ld # number of support vectors plus 1 \n",sv_num);
551
+ fprintf(modelfl,"%.8g # threshold b, each following line is a SV (starting with alpha*y)\n",model->b);
552
+
553
+ for(i=1;i<model->sv_num;i++) {
554
+ for(v=model->supvec[i]->fvec;v;v=v->next) {
555
+ fprintf(modelfl,"%.32g ",model->alpha[i]*v->factor);
556
+ for (j=0; (v->words[j]).wnum; j++) {
557
+ fprintf(modelfl,"%ld:%.8g ",
558
+ (long)(v->words[j]).wnum,
559
+ (double)(v->words[j]).weight);
560
+ }
561
+ fprintf(modelfl,"#%s\n",v->userdefined);
562
+ /* NOTE: this could be made more efficient by summing the
563
+ alpha's of identical vectors before writing them to the
564
+ file. */
565
+ }
566
+ }
567
+ fclose(modelfl);
568
+ if(verbosity>=1) {
569
+ printf("done\n");
570
+ }
571
+ }
572
+
573
+
574
+ MODEL *read_model(char *modelfile)
575
+ {
576
+ FILE *modelfl;
577
+ long i,queryid,slackid;
578
+ double costfactor;
579
+ long max_sv,max_words,ll,wpos;
580
+ char *line,*comment;
581
+ WORD *words;
582
+ char version_buffer[100];
583
+ MODEL *model;
584
+
585
+ if(verbosity>=1) {
586
+ printf("Reading model..."); fflush(stdout);
587
+ }
588
+
589
+ nol_ll(modelfile,&max_sv,&max_words,&ll); /* scan size of model file */
590
+ max_words+=2;
591
+ ll+=2;
592
+
593
+ words = (WORD *)my_malloc(sizeof(WORD)*(max_words+10));
594
+ line = (char *)my_malloc(sizeof(char)*ll);
595
+ model = (MODEL *)my_malloc(sizeof(MODEL));
596
+
597
+ if ((modelfl = fopen (modelfile, "r")) == NULL)
598
+ { perror (modelfile); exit (1); }
599
+
600
+ fscanf(modelfl,"SVM-light Version %s\n",version_buffer);
601
+ if(strcmp(version_buffer,VERSION)) {
602
+ perror ("Version of model-file does not match version of svm_classify!");
603
+ exit (1);
604
+ }
605
+ fscanf(modelfl,"%ld%*[^\n]\n", &model->kernel_parm.kernel_type);
606
+ fscanf(modelfl,"%ld%*[^\n]\n", &model->kernel_parm.poly_degree);
607
+ fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.rbf_gamma);
608
+ fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.coef_lin);
609
+ fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.coef_const);
610
+ fscanf(modelfl,"%[^#]%*[^\n]\n", model->kernel_parm.custom);
611
+
612
+ fscanf(modelfl,"%ld%*[^\n]\n", &model->totwords);
613
+ fscanf(modelfl,"%ld%*[^\n]\n", &model->totdoc);
614
+ fscanf(modelfl,"%ld%*[^\n]\n", &model->sv_num);
615
+ fscanf(modelfl,"%lf%*[^\n]\n", &model->b);
616
+
617
+ model->supvec = (DOC **)my_malloc(sizeof(DOC *)*model->sv_num);
618
+ model->alpha = (double *)my_malloc(sizeof(double)*model->sv_num);
619
+ model->index=NULL;
620
+ model->lin_weights=NULL;
621
+
622
+ for(i=1;i<model->sv_num;i++) {
623
+ fgets(line,(int)ll,modelfl);
624
+ if(!parse_document(line,words,&(model->alpha[i]),&queryid,&slackid,
625
+ &costfactor,&wpos,max_words,&comment)) {
626
+ printf("\nParsing error while reading model file in SV %ld!\n%s",
627
+ i,line);
628
+ exit(1);
629
+ }
630
+ model->supvec[i] = create_example(-1,
631
+ 0,0,
632
+ 0.0,
633
+ create_svector(words,comment,1.0));
634
+ }
635
+ fclose(modelfl);
636
+ free(line);
637
+ free(words);
638
+ if(verbosity>=1) {
639
+ fprintf(stdout, "OK. (%d support vectors read)\n",(int)(model->sv_num-1));
640
+ }
641
+ return(model);
642
+ }
643
+
644
+ MODEL *copy_model(MODEL *model)
645
+ {
646
+ MODEL *newmodel;
647
+ long i;
648
+
649
+ newmodel=(MODEL *)my_malloc(sizeof(MODEL));
650
+ (*newmodel)=(*model);
651
+ newmodel->supvec = (DOC **)my_malloc(sizeof(DOC *)*model->sv_num);
652
+ newmodel->alpha = (double *)my_malloc(sizeof(double)*model->sv_num);
653
+ newmodel->index = NULL; /* index is not copied */
654
+ newmodel->supvec[0] = NULL;
655
+ newmodel->alpha[0] = 0;
656
+ for(i=1;i<model->sv_num;i++) {
657
+ newmodel->alpha[i]=model->alpha[i];
658
+ newmodel->supvec[i]=create_example(model->supvec[i]->docnum,
659
+ model->supvec[i]->queryid,0,
660
+ model->supvec[i]->costfactor,
661
+ copy_svector(model->supvec[i]->fvec));
662
+ }
663
+ if(model->lin_weights) {
664
+ newmodel->lin_weights = (double *)my_malloc(sizeof(double)*(model->totwords+1));
665
+ for(i=0;i<model->totwords+1;i++)
666
+ newmodel->lin_weights[i]=model->lin_weights[i];
667
+ }
668
+ return(newmodel);
669
+ }
670
+
671
+ void free_model(MODEL *model, int deep)
672
+ {
673
+ long i;
674
+
675
+ if(model->supvec) {
676
+ if(deep) {
677
+ for(i=1;i<model->sv_num;i++) {
678
+ free_example(model->supvec[i],1);
679
+ }
680
+ }
681
+ free(model->supvec);
682
+ }
683
+ if(model->alpha) free(model->alpha);
684
+ if(model->index) free(model->index);
685
+ if(model->lin_weights) free(model->lin_weights);
686
+ free(model);
687
+ }
688
+
689
+
690
+ void read_documents(char *docfile, DOC ***docs, double **label,
691
+ long int *totwords, long int *totdoc)
692
+ {
693
+ char *line,*comment;
694
+ WORD *words;
695
+ long dnum=0,wpos,dpos=0,dneg=0,dunlab=0,queryid,slackid,max_docs;
696
+ long max_words_doc, ll;
697
+ double doc_label,costfactor;
698
+ FILE *docfl;
699
+
700
+ if(verbosity>=1) {
701
+ printf("Scanning examples..."); fflush(stdout);
702
+ }
703
+ nol_ll(docfile,&max_docs,&max_words_doc,&ll); /* scan size of input file */
704
+ max_words_doc+=2;
705
+ ll+=2;
706
+ max_docs+=2;
707
+ if(verbosity>=1) {
708
+ printf("done\n"); fflush(stdout);
709
+ }
710
+
711
+ (*docs) = (DOC **)my_malloc(sizeof(DOC *)*max_docs); /* feature vectors */
712
+ (*label) = (double *)my_malloc(sizeof(double)*max_docs); /* target values */
713
+ line = (char *)my_malloc(sizeof(char)*ll);
714
+
715
+ if ((docfl = fopen (docfile, "r")) == NULL)
716
+ { perror (docfile); exit (1); }
717
+
718
+ words = (WORD *)my_malloc(sizeof(WORD)*(max_words_doc+10));
719
+ if(verbosity>=1) {
720
+ printf("Reading examples into memory..."); fflush(stdout);
721
+ }
722
+ dnum=0;
723
+ (*totwords)=0;
724
+ while((!feof(docfl)) && fgets(line,(int)ll,docfl)) {
725
+ if(line[0] == '#') continue; /* line contains comments */
726
+ if(!parse_document(line,words,&doc_label,&queryid,&slackid,&costfactor,
727
+ &wpos,max_words_doc,&comment)) {
728
+ printf("\nParsing error in line %ld!\n%s",dnum,line);
729
+ exit(1);
730
+ }
731
+ (*label)[dnum]=doc_label;
732
+ /* printf("docnum=%ld: Class=%f ",dnum,doc_label); */
733
+ if(doc_label > 0) dpos++;
734
+ if (doc_label < 0) dneg++;
735
+ if (doc_label == 0) dunlab++;
736
+ if((wpos>1) && ((words[wpos-2]).wnum>(*totwords)))
737
+ (*totwords)=(words[wpos-2]).wnum;
738
+ if((*totwords) > MAXFEATNUM) {
739
+ printf("\nMaximum feature number exceeds limit defined in MAXFEATNUM!\n");
740
+ printf("LINE: %s\n",line);
741
+ exit(1);
742
+ }
743
+ (*docs)[dnum] = create_example(dnum,queryid,slackid,costfactor,
744
+ create_svector(words,comment,1.0));
745
+ /* printf("\nNorm=%f\n",((*docs)[dnum]->fvec)->twonorm_sq); */
746
+ dnum++;
747
+ if(verbosity>=1) {
748
+ if((dnum % 100) == 0) {
749
+ printf("%ld..",dnum); fflush(stdout);
750
+ }
751
+ }
752
+ }
753
+
754
+ fclose(docfl);
755
+ free(line);
756
+ free(words);
757
+ if(verbosity>=1) {
758
+ fprintf(stdout, "OK. (%ld examples read)\n", dnum);
759
+ }
760
+ (*totdoc)=dnum;
761
+ }
762
+
763
+ int parse_document(char *line, WORD *words, double *label,
764
+ long *queryid, long *slackid, double *costfactor,
765
+ long int *numwords, long int max_words_doc,
766
+ char **comment)
767
+ {
768
+ register long wpos,pos;
769
+ long wnum;
770
+ double weight;
771
+ int numread;
772
+ char featurepair[1000],junk[1000];
773
+
774
+ (*queryid)=0;
775
+ (*slackid)=0;
776
+ (*costfactor)=1;
777
+
778
+ pos=0;
779
+ (*comment)=NULL;
780
+ while(line[pos] ) { /* cut off comments */
781
+ if((line[pos] == '#') && (!(*comment))) {
782
+ line[pos]=0;
783
+ (*comment)=&(line[pos+1]);
784
+ }
785
+ if(line[pos] == '\n') { /* strip the CR */
786
+ line[pos]=0;
787
+ }
788
+ pos++;
789
+ }
790
+ if(!(*comment)) (*comment)=&(line[pos]);
791
+ /* printf("Comment: '%s'\n",(*comment)); */
792
+
793
+ wpos=0;
794
+ /* check, that line starts with target value or zero, but not with
795
+ feature pair */
796
+ if(sscanf(line,"%s",featurepair) == EOF) return(0);
797
+ pos=0;
798
+ while((featurepair[pos] != ':') && featurepair[pos]) pos++;
799
+ if(featurepair[pos] == ':') {
800
+ perror ("Line must start with label or 0!!!\n");
801
+ printf("LINE: %s\n",line);
802
+ exit (1);
803
+ }
804
+ /* read the target value */
805
+ if(sscanf(line,"%lf",label) == EOF) return(0);
806
+ pos=0;
807
+ while(space_or_null((int)line[pos])) pos++;
808
+ while((!space_or_null((int)line[pos])) && line[pos]) pos++;
809
+ while(((numread=sscanf(line+pos,"%s",featurepair)) != EOF) &&
810
+ (numread > 0) &&
811
+ (wpos<max_words_doc)) {
812
+ /* printf("%s\n",featurepair); */
813
+ while(space_or_null((int)line[pos])) pos++;
814
+ while((!space_or_null((int)line[pos])) && line[pos]) pos++;
815
+ if(sscanf(featurepair,"qid:%ld%s",&wnum,junk)==1) {
816
+ /* it is the query id */
817
+ (*queryid)=(long)wnum;
818
+ }
819
+ else if(sscanf(featurepair,"sid:%ld%s",&wnum,junk)==1) {
820
+ /* it is the slack id */
821
+ if(wnum > 0)
822
+ (*slackid)=(long)wnum;
823
+ else {
824
+ perror ("Slack-id must be greater or equal to 1!!!\n");
825
+ printf("LINE: %s\n",line);
826
+ exit (1);
827
+ }
828
+ }
829
+ else if(sscanf(featurepair,"cost:%lf%s",&weight,junk)==1) {
830
+ /* it is the example-dependent cost factor */
831
+ (*costfactor)=(double)weight;
832
+ }
833
+ else if(sscanf(featurepair,"%ld:%lf%s",&wnum,&weight,junk)==2) {
834
+ /* it is a regular feature */
835
+ if(wnum<=0) {
836
+ perror ("Feature numbers must be larger or equal to 1!!!\n");
837
+ printf("LINE: %s\n",line);
838
+ exit (1);
839
+ }
840
+ if((wpos>0) && ((words[wpos-1]).wnum >= wnum)) {
841
+ perror ("Features must be in increasing order!!!\n");
842
+ printf("LINE: %s\n",line);
843
+ exit (1);
844
+ }
845
+ (words[wpos]).wnum=wnum;
846
+ (words[wpos]).weight=(FVAL)weight;
847
+ wpos++;
848
+ }
849
+ else {
850
+ perror ("Cannot parse feature/value pair!!!\n");
851
+ printf("'%s' in LINE: %s\n",featurepair,line);
852
+ exit (1);
853
+ }
854
+ }
855
+ (words[wpos]).wnum=0;
856
+ (*numwords)=wpos+1;
857
+ return(1);
858
+ }
859
+
860
+ double *read_alphas(char *alphafile,long totdoc)
861
+ /* reads the alpha vector from a file as written by the
862
+ write_alphas function */
863
+ {
864
+ FILE *fl;
865
+ double *alpha;
866
+ long dnum;
867
+
868
+ if ((fl = fopen (alphafile, "r")) == NULL)
869
+ { perror (alphafile); exit (1); }
870
+
871
+ alpha = (double *)my_malloc(sizeof(double)*totdoc);
872
+ if(verbosity>=1) {
873
+ printf("Reading alphas..."); fflush(stdout);
874
+ }
875
+ dnum=0;
876
+ while((!feof(fl)) && fscanf(fl,"%lf\n",&alpha[dnum]) && (dnum<totdoc)) {
877
+ dnum++;
878
+ }
879
+ if(dnum != totdoc)
880
+ { perror ("\nNot enough values in alpha file!"); exit (1); }
881
+ fclose(fl);
882
+
883
+ if(verbosity>=1) {
884
+ printf("done\n"); fflush(stdout);
885
+ }
886
+
887
+ return(alpha);
888
+ }
889
+
890
/* Greps through file and determines                                 */
/*   nol - out: number of lines (1 plus the number of '\n')          */
/*   wol - out: maximum number of whitespace characters per line     */
/*   ll  - out: length of the longest line, '\n' included            */
/* A final line that is not terminated by '\n' is included in wol    */
/* and ll, so buffers sized from them are large enough for it.       */
/* Terminates with exit(1) if the file cannot be opened.             */
void nol_ll(char *file, long int *nol, long int *wol, long int *ll)
{
  FILE *fl;
  int ic;
  char c;
  long current_length,current_wol;

  if ((fl = fopen (file, "r")) == NULL)
  { perror (file); exit (1); }
  current_length=0;
  current_wol=0;
  (*ll)=0;
  (*nol)=1;
  (*wol)=0;
  while((ic=getc(fl)) != EOF) {
    c=(char)ic;
    current_length++;
    if(space_or_null((int)c)) {
      current_wol++;
    }
    if(c == '\n') {
      (*nol)++;
      if(current_length>(*ll)) {
        (*ll)=current_length;
      }
      if(current_wol>(*wol)) {
        (*wol)=current_wol;
      }
      current_length=0;
      current_wol=0;
    }
  }
  /* last line without trailing '\n'; both counters are 0 here if */
  /* the file ended with a newline                                */
  if(current_length>(*ll)) {
    (*ll)=current_length;
  }
  if(current_wol>(*wol)) {
    (*wol)=current_wol;
  }
  fclose(fl);
}
926
+
927
/* Returns the smaller of a and b. */
long minl(long int a, long int b)
{
  return((a<b) ? a : b);
}
934
+
935
/* Returns the larger of a and b. */
long maxl(long int a, long int b)
{
  return((a>b) ? a : b);
}
942
+
943
/* Returns the value of clock() converted to hundredths of a second. */
long get_runtime(void)
{
  double ticks=(double)clock();
  double hundredths=ticks*100.0/(double)CLOCKS_PER_SEC;

  return((long)hundredths);
}
949
+
950
+
951
# ifdef _MSC_VER

/* Only compiled when _MSC_VER is defined: provides isnan() by */
/* forwarding to _isnan().                                     */
int isnan(double a)
{
  int is_not_a_number=_isnan(a);

  return(is_not_a_number);
}

# endif
959
+
960
/* Returns non-zero if c is 0 or a whitespace character according */
/* to isspace(), and 0 otherwise.                                 */
int space_or_null(int c) {
  return((c==0) ? 1 : isspace((unsigned char)c));
}
965
+
966
/* malloc() wrapper that never returns NULL: if the allocation fails */
/* it prints a message and terminates with exit(1). A request for 0  */
/* bytes is turned into a request for 1 byte.                        */
void *my_malloc(size_t size)
{
  void *block;

  if(size == 0) {
    size=1;                   /* for AIX compatibility */
  }
  block=(void *)malloc(size);
  if(block) {
    return(block);
  }
  perror ("Out of memory!\n");
  exit (1);
}
977
+
978
/* Prints the copyright and licence notice to stdout. */
void copyright_notice(void)
{
  static const char *const notice[] = {
    "\nCopyright: Thorsten Joachims, thorsten@joachims.org\n\n",
    "This software is available for non-commercial use only. It must not\n",
    "be modified and distributed without prior permission of the author.\n",
    "The author is not responsible for implications from the use of this\n",
    "software.\n\n"
  };
  size_t k;

  for(k=0;k<(sizeof(notice)/sizeof(notice[0]));k++) {
    fputs(notice[k],stdout);
  }
}