nysol-take 3.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (161) hide show
  1. checksums.yaml +7 -0
  2. data/bin/mbiclique.rb +317 -0
  3. data/bin/mbipolish.rb +362 -0
  4. data/bin/mccomp.rb +235 -0
  5. data/bin/mclique.rb +295 -0
  6. data/bin/mclique2g.rb +105 -0
  7. data/bin/mcliqueInfo.rb +203 -0
  8. data/bin/mfriends.rb +202 -0
  9. data/bin/mgdiff.rb +252 -0
  10. data/bin/mhifriend.rb +456 -0
  11. data/bin/mhipolish.rb +465 -0
  12. data/bin/mitemset.rb +168 -0
  13. data/bin/mpal.rb +410 -0
  14. data/bin/mpolishing.rb +399 -0
  15. data/bin/msequence.rb +165 -0
  16. data/bin/mtra2g.rb +476 -0
  17. data/bin/mtra2gc.rb +360 -0
  18. data/ext/grhfilrun/extconf.rb +12 -0
  19. data/ext/grhfilrun/grhfilrun.c +85 -0
  20. data/ext/grhfilrun/src/_sspc.c +358 -0
  21. data/ext/grhfilrun/src/aheap.c +545 -0
  22. data/ext/grhfilrun/src/aheap.h +251 -0
  23. data/ext/grhfilrun/src/base.c +92 -0
  24. data/ext/grhfilrun/src/base.h +59 -0
  25. data/ext/grhfilrun/src/fstar.c +497 -0
  26. data/ext/grhfilrun/src/fstar.h +80 -0
  27. data/ext/grhfilrun/src/grhfil.c +214 -0
  28. data/ext/grhfilrun/src/itemset.c +713 -0
  29. data/ext/grhfilrun/src/itemset.h +170 -0
  30. data/ext/grhfilrun/src/problem.c +415 -0
  31. data/ext/grhfilrun/src/problem.h +179 -0
  32. data/ext/grhfilrun/src/queue.c +533 -0
  33. data/ext/grhfilrun/src/queue.h +182 -0
  34. data/ext/grhfilrun/src/sample.c +19 -0
  35. data/ext/grhfilrun/src/sspc.c +597 -0
  36. data/ext/grhfilrun/src/sspc2.c +491 -0
  37. data/ext/grhfilrun/src/stdlib2.c +1482 -0
  38. data/ext/grhfilrun/src/stdlib2.h +892 -0
  39. data/ext/grhfilrun/src/trsact.c +817 -0
  40. data/ext/grhfilrun/src/trsact.h +160 -0
  41. data/ext/grhfilrun/src/vec.c +745 -0
  42. data/ext/grhfilrun/src/vec.h +172 -0
  43. data/ext/lcmrun/extconf.rb +20 -0
  44. data/ext/lcmrun/lcmrun.cpp +99 -0
  45. data/ext/lcmrun/src/aheap.c +216 -0
  46. data/ext/lcmrun/src/aheap.h +111 -0
  47. data/ext/lcmrun/src/base.c +92 -0
  48. data/ext/lcmrun/src/base.h +59 -0
  49. data/ext/lcmrun/src/itemset.c +496 -0
  50. data/ext/lcmrun/src/itemset.h +157 -0
  51. data/ext/lcmrun/src/lcm.c +427 -0
  52. data/ext/lcmrun/src/problem.c +349 -0
  53. data/ext/lcmrun/src/problem.h +177 -0
  54. data/ext/lcmrun/src/queue.c +528 -0
  55. data/ext/lcmrun/src/queue.h +176 -0
  56. data/ext/lcmrun/src/sgraph.c +359 -0
  57. data/ext/lcmrun/src/sgraph.h +173 -0
  58. data/ext/lcmrun/src/stdlib2.c +1282 -0
  59. data/ext/lcmrun/src/stdlib2.h +823 -0
  60. data/ext/lcmrun/src/trsact.c +747 -0
  61. data/ext/lcmrun/src/trsact.h +159 -0
  62. data/ext/lcmrun/src/vec.c +731 -0
  63. data/ext/lcmrun/src/vec.h +171 -0
  64. data/ext/lcmseq0run/extconf.rb +20 -0
  65. data/ext/lcmseq0run/lcmseq0run.cpp +59 -0
  66. data/ext/lcmseq0run/src/aheap.c +216 -0
  67. data/ext/lcmseq0run/src/aheap.h +111 -0
  68. data/ext/lcmseq0run/src/base.c +92 -0
  69. data/ext/lcmseq0run/src/base.h +59 -0
  70. data/ext/lcmseq0run/src/itemset.c +518 -0
  71. data/ext/lcmseq0run/src/itemset.h +157 -0
  72. data/ext/lcmseq0run/src/itemset_zero.c +522 -0
  73. data/ext/lcmseq0run/src/lcm_seq.c +446 -0
  74. data/ext/lcmseq0run/src/lcm_seq_zero.c +446 -0
  75. data/ext/lcmseq0run/src/problem.c +439 -0
  76. data/ext/lcmseq0run/src/problem.h +179 -0
  77. data/ext/lcmseq0run/src/problem_zero.c +439 -0
  78. data/ext/lcmseq0run/src/queue.c +533 -0
  79. data/ext/lcmseq0run/src/queue.h +182 -0
  80. data/ext/lcmseq0run/src/stdlib2.c +1350 -0
  81. data/ext/lcmseq0run/src/stdlib2.h +864 -0
  82. data/ext/lcmseq0run/src/trsact.c +747 -0
  83. data/ext/lcmseq0run/src/trsact.h +159 -0
  84. data/ext/lcmseq0run/src/vec.c +779 -0
  85. data/ext/lcmseq0run/src/vec.h +172 -0
  86. data/ext/lcmseqrun/extconf.rb +20 -0
  87. data/ext/lcmseqrun/lcmseqrun.cpp +101 -0
  88. data/ext/lcmseqrun/src/aheap.c +216 -0
  89. data/ext/lcmseqrun/src/aheap.h +111 -0
  90. data/ext/lcmseqrun/src/base.c +92 -0
  91. data/ext/lcmseqrun/src/base.h +59 -0
  92. data/ext/lcmseqrun/src/itemset.c +518 -0
  93. data/ext/lcmseqrun/src/itemset.h +157 -0
  94. data/ext/lcmseqrun/src/itemset_zero.c +522 -0
  95. data/ext/lcmseqrun/src/lcm_seq.c +447 -0
  96. data/ext/lcmseqrun/src/lcm_seq_zero.c +446 -0
  97. data/ext/lcmseqrun/src/problem.c +439 -0
  98. data/ext/lcmseqrun/src/problem.h +179 -0
  99. data/ext/lcmseqrun/src/problem_zero.c +439 -0
  100. data/ext/lcmseqrun/src/queue.c +533 -0
  101. data/ext/lcmseqrun/src/queue.h +182 -0
  102. data/ext/lcmseqrun/src/stdlib2.c +1350 -0
  103. data/ext/lcmseqrun/src/stdlib2.h +864 -0
  104. data/ext/lcmseqrun/src/trsact.c +747 -0
  105. data/ext/lcmseqrun/src/trsact.h +159 -0
  106. data/ext/lcmseqrun/src/vec.c +779 -0
  107. data/ext/lcmseqrun/src/vec.h +172 -0
  108. data/ext/lcmtransrun/extconf.rb +18 -0
  109. data/ext/lcmtransrun/lcmtransrun.cpp +264 -0
  110. data/ext/macerun/extconf.rb +20 -0
  111. data/ext/macerun/macerun.cpp +57 -0
  112. data/ext/macerun/src/aheap.c +217 -0
  113. data/ext/macerun/src/aheap.h +112 -0
  114. data/ext/macerun/src/itemset.c +491 -0
  115. data/ext/macerun/src/itemset.h +158 -0
  116. data/ext/macerun/src/mace.c +503 -0
  117. data/ext/macerun/src/problem.c +346 -0
  118. data/ext/macerun/src/problem.h +174 -0
  119. data/ext/macerun/src/queue.c +529 -0
  120. data/ext/macerun/src/queue.h +177 -0
  121. data/ext/macerun/src/sgraph.c +360 -0
  122. data/ext/macerun/src/sgraph.h +174 -0
  123. data/ext/macerun/src/stdlib2.c +993 -0
  124. data/ext/macerun/src/stdlib2.h +811 -0
  125. data/ext/macerun/src/vec.c +634 -0
  126. data/ext/macerun/src/vec.h +170 -0
  127. data/ext/sspcrun/extconf.rb +20 -0
  128. data/ext/sspcrun/src/_sspc.c +358 -0
  129. data/ext/sspcrun/src/aheap.c +545 -0
  130. data/ext/sspcrun/src/aheap.h +251 -0
  131. data/ext/sspcrun/src/base.c +92 -0
  132. data/ext/sspcrun/src/base.h +59 -0
  133. data/ext/sspcrun/src/fstar.c +496 -0
  134. data/ext/sspcrun/src/fstar.h +80 -0
  135. data/ext/sspcrun/src/grhfil.c +213 -0
  136. data/ext/sspcrun/src/itemset.c +713 -0
  137. data/ext/sspcrun/src/itemset.h +170 -0
  138. data/ext/sspcrun/src/problem.c +415 -0
  139. data/ext/sspcrun/src/problem.h +179 -0
  140. data/ext/sspcrun/src/queue.c +533 -0
  141. data/ext/sspcrun/src/queue.h +182 -0
  142. data/ext/sspcrun/src/sample.c +19 -0
  143. data/ext/sspcrun/src/sspc.c +598 -0
  144. data/ext/sspcrun/src/sspc2.c +491 -0
  145. data/ext/sspcrun/src/stdlib2.c +1482 -0
  146. data/ext/sspcrun/src/stdlib2.h +892 -0
  147. data/ext/sspcrun/src/trsact.c +817 -0
  148. data/ext/sspcrun/src/trsact.h +160 -0
  149. data/ext/sspcrun/src/vec.c +745 -0
  150. data/ext/sspcrun/src/vec.h +172 -0
  151. data/ext/sspcrun/sspcrun.cpp +54 -0
  152. data/lib/nysol/enumLcmEp.rb +338 -0
  153. data/lib/nysol/enumLcmEsp.rb +284 -0
  154. data/lib/nysol/enumLcmIs.rb +275 -0
  155. data/lib/nysol/enumLcmSeq.rb +143 -0
  156. data/lib/nysol/items.rb +201 -0
  157. data/lib/nysol/seqDB.rb +256 -0
  158. data/lib/nysol/take.rb +39 -0
  159. data/lib/nysol/taxonomy.rb +113 -0
  160. data/lib/nysol/traDB.rb +257 -0
  161. metadata +239 -0
@@ -0,0 +1,597 @@
1
+ /* SSPC: Similar Set Pair Comparison */
2
+ /* 2007/11/30 Takeaki Uno, e-mail:uno@nii.jp,
3
+ homepage: http://research.nii.ac.jp/~uno/index.html */
4
+ /* This program is available for only academic use, basically.
5
+ Anyone can modify this program, but he/she has to write down
6
+ the change of the modification on the top of the source code.
7
+ Neither contact nor appointment to Takeaki Uno is needed.
8
+ If one wants to re-distribute this code, do not forget to
9
+ refer the newest code, and show the link to homepage of
10
+ Takeaki Uno, to notify the news about this code for the users.
11
+ For the commercial use, please make a contact to Takeaki Uno. */
12
+
13
+ /* internal_params.l1 = #solutions
14
+ internal_params.l2 = #rows
15
+ internal_params.l3 = #columns
16
+ */
17
+
18
+
19
+ #ifndef _sspc_c_
20
+ #define _sspc_c_
21
+
22
+ #define WEIGHT_DOUBLE
23
+
24
+ #include"trsact.c"
25
+ #include"problem.c"
26
+
27
+ #define SSPC_INCLUSION 1 /* 'i' command: inclusion test; the pair is reported when the common weight reaches frq_lb times the weight of one side */
28
+ #define SSPC_SIMILARITY 2 /* 'I' command: the common weight must reach frq_lb times the weight of BOTH sides */
29
+ #define SSPC_INTERSECTION 4 /* 'T' command: the score is the common weight itself */
30
+ #define SSPC_RESEMBLANCE 8 /* 'R' command: score = common / (w[a] + w[b] - common) */
31
+ #define SSPC_INNERPRODUCT 16 /* 'C' command: score = inner product of the normalized vectors */
32
+ #define SSPC_MININT 32 /* 's' command: score = common / MIN(w[a], w[b]) */
33
+ #define SSPC_MAXINT 64 /* 'S' command: score = common / MAX(w[a], w[b]) */
34
+ #define SSPC_PMI 128 /* 'P' command: PMI-based score (see the formula in SSPC()) */
35
+ #define SSPC_COUNT 2048 /* '#' command: print the number of matches per record instead of the pairs */
36
+ #define SSPC_MATRIX 4096 /* 'M' command: input is read through PP->MM and SSPCmat() is used */
37
+ #define SSPC_UNIFY 8192 /* 'D' command: first entry of each record is an ID; records sharing an ID are unified */
38
+ #define SSPC_NO_NEIB 16384 /* 'n' command: exclude a and b themselves when comparing a and b */
39
+ #define SSPC_POLISH 32768 /* 'Y' command: data-polishing output mode */
40
+ #define SSPC_POLISH2 65536 /* 'y' command: data-polishing mode that also stores found pairs in PP->buf */
41
+ #define SSPC_OUTPUT_INTERSECT 131072 /* set by the -TT option: print PP->siz in front of each output pair */
42
+
43
+ void SSPC_error (){
44
+ ERROR_MES = "command explanation";
45
+ print_err ("SSPC: [ISCfQq] [options] input-filename ratio/threshold [output-filename]\n\
46
+ %%:show progress, _:no message, +:write solutions in append mode\n\
47
+ #:count the number of similar records for each record\n\
48
+ i(inclusion): find pairs [ratio] of items (weighted sum) of one is included in the other (1st is included in 2nd)\n\
49
+ I(both-inclusion): find pairs s.t. the size (weight sum) of intersection is [ratio] of both\n\
50
+ S:set similarity measure to |A\\cap B| / max{|A|,|B|}\n\
51
+ s:set similarity measure to |A\\cap B| / min{|A|,|B|}\n\
52
+ T(intersection): find pairs having common [threshld] items\n\
53
+ R(resemblance): find pairs s.t. |A\\capB|/|A\\cupB| >= [threshld]\n\
54
+ P(PMI): set similarity measure to log (|A\\capB|*|all| / (|A|*|B|)) where |all| is the number of all items\n\
55
+ C(cosign distance): find pairs s.t. inner product of their normalized vectors >= [threshld]\n\
56
+ f,Q:output ratio/size of pairs following/preceding to the pairs\n\
57
+ D:the first entry is ID, and unify the records with the same ID\n\
58
+ N:normalize the ID of latter sets, in -c mode\n\
59
+ n:do not consider a and b in the set when comparing a and b\n\
60
+ Y(y):output elements of each set that contribute to no similarity (y:fast with much memory use)\n\
61
+ 1:remove duplicated items in each transaction\n\
62
+ t:transpose the database so that i-th transaction will be item i\n\
63
+ E:input column-row representation\n\
64
+ w:load weight of each item in each row (with E command)\n\
65
+ [options]\n\
66
+ -2 [num]:2nd input file name\n\
67
+ -9 [th] [filename]:write pairs satisfies 2nd threshold [th] to file [filename]\n\
68
+ -K [num]:output [num] pairs of most large similarities\n\
69
+ -k [num]:output [num] elements of most large similarities, for each element\n\
70
+ -w [filename]:read item weights from [filename]\n\
71
+ -W [filename]:read item weights in each row from [filename]\n\
72
+ -l,-u [num]:ignore transactions with size (weight sum) less/more than [num]\n\
73
+ -L,-U [num]: ignore items appearing less/more than [num]\n\
74
+ -c [num]:compare transactions of IDs less than num and the others (if 0 is given, automatically set to the boundary of the 1st and 2nd file)\n\
75
+ -b [num]:ignore pairs having no common item of at least [num]th frequency\n\
76
+ -B [num]:ignore pairs having no common item of frequency at most [num]\n\
77
+ -T [num]:ignore pairs whose intersection size is less than [num]\n\
78
+ (-TT [num]: -T with outputting intersection size to the 1st column of each line\n\
79
+ -# [num]:stop after outputting [num] solutions\n\
80
+ -, [char]:give the separator of the numbers in the output\n\
81
+ -Q [filename]:replace the output numbers according to the permutation table given by [filename]\n\
82
+ # the 1st letter of input-filename cannot be '-'.\n\
83
+ # if the output file name is -, the solutions will be output to standard output.\n");
84
+ EXIT;
85
+ //items have to begin from 1\n");
86
+ //E:read edge list file\n
87
+ }
88
+
89
+ // c:multi stream transaction mode, separated by an empty transaction
90
+
91
+ /***********************************************************************/
92
+ /* read parameters given by command line */
93
+ /***********************************************************************/
94
+ void SSPC_read_param (int argc, char *argv[], PROBLEM *PP){
95
+ int c=1;
96
+ ITEMSET *II = &PP->II;
97
+ TRSACT *TT = &PP->TT;
98
+ PP->th = 0; PP->th2 = 0;
99
+
100
+ if ( argc < c+3 ){ SSPC_error (); return; }
101
+
102
+ if ( !strchr (argv[c], '_') ){ II->flag |= SHOW_MESSAGE; TT->flag |= SHOW_MESSAGE; }
103
+ if ( strchr (argv[c], '%') ) II->flag |= SHOW_PROGRESS;
104
+ if ( strchr (argv[c], '+') ) II->flag |= ITEMSET_APPEND;
105
+ if ( strchr (argv[c], 'f') ) II->flag |= ITEMSET_FREQ;
106
+ if ( strchr (argv[c], 'Q') ) II->flag |= ITEMSET_PRE_FREQ;
107
+ if ( strchr (argv[c], 'M') ) PP->problem |= SSPC_MATRIX;
108
+ if ( strchr (argv[c], 'i') ) PP->problem = SSPC_INCLUSION;
109
+ else if ( strchr (argv[c], 'I') ) PP->problem = SSPC_SIMILARITY;
110
+ else if ( strchr (argv[c], 'T') ) PP->problem = SSPC_INTERSECTION;
111
+ else if ( strchr (argv[c], 's') ) PP->problem = SSPC_MININT;
112
+ else if ( strchr (argv[c], 'S') ) PP->problem = SSPC_MAXINT;
113
+ else if ( strchr (argv[c], 'R') ) PP->problem = SSPC_RESEMBLANCE;
114
+ else if ( strchr (argv[c], 'P') ) PP->problem = SSPC_PMI;
115
+ else if ( strchr (argv[c], 'C') ) PP->problem = SSPC_INNERPRODUCT;
116
+ else error ("i, I, s, S, R, T or C command has to be specified", EXIT);
117
+ if ( strchr (argv[c], '#') ) PP->problem |= SSPC_COUNT;
118
+ if ( strchr (argv[c], 'N') ) PP->problem |= PROBLEM_NORMALIZE;
119
+ if ( strchr (argv[c], 'D') ) PP->problem |= SSPC_UNIFY;
120
+ if ( strchr (argv[c], 'n') ) PP->problem |= SSPC_NO_NEIB;
121
+ if ( strchr (argv[c], 'Y') ) PP->problem |= SSPC_POLISH;
122
+ if ( strchr (argv[c], 'y') ) PP->problem |= SSPC_POLISH2;
123
+ if ( !strchr (argv[c], 't') ) TT->flag |= LOAD_TPOSE;
124
+ if ( strchr (argv[c], 'E') ) TT->flag |= LOAD_ELE;
125
+ if ( strchr (argv[c], 'w') ) TT->flag |= LOAD_EDGEW;
126
+ if ( strchr (argv[c], '1') ) TT->flag |= LOAD_RM_DUP;
127
+ c++;
128
+
129
+ while ( argv[c][0] == '-' ){
130
+ switch (argv[c][1]){
131
+ case 'K': II->topk_k = atoi(argv[c+1]);
132
+ break; case 'k': II->itemtopk_item = atoi(argv[c+1]); II->itemtopk_item2 = 1;
133
+ break; case 'L': TT->row_lb = atoi(argv[c+1]);
134
+ break; case 'U': TT->row_ub = atoi(argv[c+1]);
135
+ break; case 'l': TT->w_lb = atof(argv[c+1]);
136
+ break; case 'u': TT->w_ub = atof(argv[c+1]);
137
+ break; case 'w': PP->TT.wfname = argv[c+1];
138
+ break; case 'W': PP->TT.item_wfname = argv[c+1];
139
+ break; case 'c': PP->dir = 1; TT->sep = atoi(argv[c+1]);
140
+ break; case '2': PP->TT.fname2 = argv[c+1];
141
+ break; case '9': PP->th2 = atof(argv[c+1]); c++; PP->output_fname2 = argv[c+1];
142
+ break; case 'b': PP->II.len_lb = atoi(argv[c+1]);
143
+ break; case 'B': PP->II.len_ub = atoi(argv[c+1]);
144
+ break; case 'T': PP->th = atoi(argv[c+1]);
145
+ if ( argv[c][2] == 'T' ){ PP->problem |= SSPC_OUTPUT_INTERSECT; }
146
+ break; case '#': II->max_solutions = atoi(argv[c+1]);
147
+ break; case ',': II->separator = argv[c+1][0];
148
+ break; case 'Q': PP->outperm_fname = argv[c+1];
149
+ break; default: goto NEXT;
150
+ }
151
+ c += 2;
152
+ if ( argc<c+2 ){ SSPC_error (); return; }
153
+ }
154
+
155
+ NEXT:;
156
+ if ( PP->problem & SSPC_MATRIX ) PP->MM.fname = argv[c];
157
+ else PP->TT.fname = argv[c];
158
+ II->frq_lb = atof(argv[c+1]);
159
+ if ( argc>c+2 ) PP->output_fname = argv[c+2];
160
+ }
161
+
162
+ void SSPC_output (PROBLEM *PP, QUEUE_INT *cnt, QUEUE_INT i, QUEUE_INT ii){
163
+ size_t b;
164
+ if ( PP->problem & (SSPC_POLISH+SSPC_POLISH2) ){
165
+ PP->vecchr[i] = 1;
166
+ if ( PP->problem & SSPC_POLISH2 ){ // store the solution
167
+ if ( (b=PP->itemary[PP->TT.T.clms]) ){
168
+ PP->itemary[PP->TT.T.clms] = PP->buf[b]; // use deleted cell
169
+ } else { // allocate new cell
170
+ b = PP->buf_end;
171
+ realloci (PP->buf, b+30, EXIT);
172
+ PP->buf_end += 2;
173
+ }
174
+ PP->buf[b] = PP->itemary[i]; // insert the cell to list i
175
+ PP->buf[b+1] = ii;
176
+ PP->itemary[i] = b;
177
+ }
178
+ } else if ( PP->problem & SSPC_COUNT ) (*cnt)++;
179
+ else {
180
+ if ( PP->problem & SSPC_OUTPUT_INTERSECT ){
181
+ FILE2_print_int (&PP->II.multi_fp[0], PP->siz, 0);
182
+ FILE2_putc (&PP->II.multi_fp[0], ' ');
183
+ }
184
+ if ( PP->II.itemtopk_end > 0 ){
185
+ PP->II.itemtopk_item = i; PP->II.itemtopk_item2 = ii;
186
+ ITEMSET_output_itemset (&PP->II, NULL, 0);
187
+ PP->II.itemtopk_item = ii; PP->II.itemtopk_item2 = i;
188
+ }
189
+ ITEMSET_output_itemset (&PP->II, NULL, 0);
190
+ }
191
+
192
+
193
+ }
194
+
195
+ /*************************************************************************/
196
+ /* SSPC main routine */
197
+ /*************************************************************************/
198
+ void SSPC (PROBLEM *PP){
199
+ ITEMSET *II = &PP->II;
200
+ TRSACT *TT = &PP->TT;
201
+ QUEUE J = INIT_QUEUE;
202
+ QUEUE_ID i, j, begin = (PP->problem&(SSPC_POLISH+SSPC_POLISH2))?0:(PP->dir>0?TT->sep:1);
203
+ QUEUE_ID f, f_=0, ii=0, m, t, ff;
204
+ QUEUE_INT *x, **o = NULL, *oi, *oj, cnt, id=0;
205
+ WEIGHT *w, f1, f2, f1_=0, f2_=0, c, cc, *y, yy=0;
206
+ char *u = NULL, *mark = NULL;
207
+ int pf = TT->flag2&TRSACT_NEGATIVE;
208
+ double sq =0;
209
+ int count=0, fs=SSPC_INTERSECTION +SSPC_RESEMBLANCE +SSPC_INNERPRODUCT +SSPC_MAXINT +SSPC_MININT + SSPC_PMI;
210
+ size_t b, bb;
211
+ FILE *fp = NULL;
212
+
213
+ // initialization
214
+ calloc2 (w, TT->T.clms*2, EXIT);
215
+ if ( PP->problem & SSPC_NO_NEIB ) calloc2 (mark, TT->T.clms, EXIT);
216
+ if ( (PP->problem&SSPC_INNERPRODUCT) && !TT->T.w ) FLOOP (i, 0, TT->T.clms) TT->w[i] *= TT->w[i];
217
+ TRSACT_delivery (TT, &TT->jump, w, w+TT->T.clms, NULL, TT->T.clms);
218
+ // FLOOP (i, 0, PP->dir?TT->sep:1) TT->OQ[i].end = 0;
219
+ if ( (PP->problem & SSPC_INNERPRODUCT) && TT->T.w ){ // normalize the vectors for inner product
220
+ ARY_FILL (w, 0, TT->T.clms, 0);
221
+ FLOOP (t, 0, TT->T.t){
222
+ y = TT->T.w[t];
223
+ MQUE_FLOOP (TT->T.v[t], x){ w[*x] += (*y)*(*y); y++; }
224
+ }
225
+ FLOOP (i, 0, TT->T.clms) w[i] = sqrt(w[i]);
226
+ FLOOP (t, 0, TT->T.t){
227
+ y = TT->T.w[t];
228
+ MQUE_FLOOP (TT->T.v[t], x){ *y /= w[*x]; y++; }
229
+ }
230
+ }
231
+ FLOOP (i, 0, TT->T.clms) TT->OQ[i].end = 0;
232
+ II->itemset.t = 2;
233
+ if ( PP->output_fname2 ) fopen2 (fp, PP->output_fname2, "w", EXIT);
234
+
235
+ // skipping items of large frequencies
236
+ if ( TT->flag & LOAD_SIZSORT ){
237
+ malloc2 (o, TT->T.clms, EXIT);
238
+ FLOOP (i, 0, TT->T.clms){
239
+ o[i] = TT->OQ[i].v;
240
+ TT->OQ[i].v[TT->OQ[i].t] = INTHUGE; // put end-mark at the last; also used in main loop
241
+ for (j=0 ; TT->OQ[i].v[j] < PP->II.len_lb ; j++);
242
+ TT->OQ[i].v = &TT->OQ[i].v[j]; TT->OQ[i].t -= j;
243
+ }
244
+ }
245
+
246
+ if (PP->problem & SSPC_UNIFY){
247
+ FLOOP (i, 0, TT->T.t){ // first entry is ID
248
+ id = TT->T.v[i].v[0];
249
+ FLOOP (ii, 0, TT->T.v[i].t-1) TT->T.v[i].v[ii] = TT->T.v[i].v[ii+1];
250
+ TT->T.v[i].t--;
251
+ TT->T.v[i].v[TT->T.v[i].t] = id;
252
+ }
253
+ QUEUE_alloc (&J, TT->T.clms);
254
+ calloc2 (u, TT->T.clms, EXIT);
255
+ ARY_FILL (w, 0, TT->T.clms*2, 0); // weight is to be re-computed
256
+ id = TT->T.t; J.t = J.s = 0;
257
+ FLOOP (i, 0, TT->T.t){
258
+ if ( id != TT->T.v[i].v[TT->T.v[i].t] ){
259
+ id = TT->T.v[i].v[TT->T.v[i].t];
260
+ MQUE_FLOOP (J, x) u[*x] = 0;
261
+ J.t = J.s = 0;
262
+ }
263
+
264
+ MQUE_FLOOP (TT->T.v[i], x){
265
+ if ( u[*x] == 0 ){ QUE_INS (J, *x); u[*x] = 1; }
266
+ else continue;
267
+ w[*x] += TT->w[i]; w[*x+TT->T.clms] += TT->pw[i];
268
+ }
269
+ }
270
+ MQUE_FLOOP (J, x){ u[*x] = 0; }
271
+ J.t = J.s = 0;
272
+ }
273
+
274
+ // main loop
275
+ FLOOP (i, begin, TT->T.clms){
276
+ //printf ("## %d\n", i);
277
+ if ( II->flag & SHOW_PROGRESS ){
278
+ if ( count < i*100/TT->T.clms ){ count++; fprintf (stderr, "%d%%\n", count); }
279
+ }
280
+ cnt = 0;
281
+ II->itemset.v[0] = ((PP->problem&PROBLEM_NORMALIZE)&& PP->dir>0)? i-TT->sep: i;
282
+ if ( (PP->problem & SSPC_INNERPRODUCT) && !TT->T.w ) sq = sqrt (w[i]);
283
+
284
+ // delivery
285
+ if ( PP->problem & (SSPC_POLISH+SSPC_POLISH2) ) m = PP->TT.T.clms;
286
+ else m = (PP->dir>0)?TT->sep:i;
287
+ TT->jump.t = TT->jump.s;
288
+ if (PP->problem & SSPC_UNIFY) id = TT->T.v[i].v[TT->T.v[TT->OQ[i].s].t];
289
+ FLOOP (ii, TT->OQ[i].s, TT->OQ[i].t){
290
+ t = TT->OQ[i].v[ii];
291
+ if ( TT->T.w && (PP->problem & SSPC_INNERPRODUCT)){ // get item weight of current vector
292
+ y = TT->T.w[t];
293
+ MQUE_MLOOP (TT->T.v[t], x, i) y++;
294
+ yy = *y;
295
+ }
296
+ if (PP->problem & SSPC_NO_NEIB){ mark[t] = 1; } // for no_neib
297
+ if ( TT->T.w ) y = TT->T.w[t]; else y = 0;
298
+ if (PP->problem & SSPC_UNIFY){ // for unify
299
+ if ( id != TT->T.v[t].v[TT->T.v[t].t] ){
300
+ id = TT->T.v[t].v[TT->T.v[t].t];
301
+ if ( t != TT->OQ[i].s ){
302
+ MQUE_FLOOP (J, x) u[*x] = 0;
303
+ J.t = J.s = 0;
304
+ }
305
+ }
306
+ } // unify end
307
+
308
+ MQUE_MLOOP (TT->T.v[t], x, m){
309
+ if ( (PP->problem & SSPC_POLISH2) && *x < i) continue;
310
+ if ( TT->OQ[*x].end == 0 ){
311
+ QUE_INS (TT->jump, *x);
312
+ PP->occ_w[*x] = 0;
313
+ if ( pf ) PP->occ_pw[*x] = 0;
314
+ }
315
+ if (PP->problem & SSPC_UNIFY){
316
+ if ( u[*x] == 0 ){ QUE_INS (J, *x); u[*x] = 1; }
317
+ else continue;
318
+ }
319
+ TT->OQ[*x].end++;
320
+ if ( TT->T.w ){
321
+ if (PP->problem & SSPC_INNERPRODUCT){
322
+ PP->occ_w[*x] += (*y) * yy; if ( *y>0 && pf) PP->occ_pw[*x] += (*y) * yy;
323
+ } else { PP->occ_w[*x] += *y; if ( *y>0 && pf) PP->occ_pw[*x] += *y; }
324
+ y++;
325
+ } else {
326
+ PP->occ_w[*x] += TT->w[t]; if ( pf ) PP->occ_pw[*x] += TT->pw[t];
327
+ }
328
+ }
329
+ }
330
+
331
+ if (PP->problem & SSPC_UNIFY){ // for unify
332
+ MQUE_FLOOP (J, x) u[*x] = 0;
333
+ J.t = J.s = 0;
334
+ } // unify end
335
+
336
+ MQUE_FLOOP (TT->jump, x){
337
+ if ( *x == i ) goto SKIP;
338
+ II->itemset.v[1] = *x;
339
+ c = PP->occ_w[*x];
340
+
341
+ if ( TT->flag & LOAD_SIZSORT ){
342
+ for (oi=o[i],oj=o[*x] ; *oi<PP->II.len_lb ; oi++ ){
343
+ while ( *oj < *oi ) oj++;
344
+ if ( *oi == *oj ) c += TT->w[*oi];
345
+ }
346
+ }
347
+ if (PP->problem & SSPC_NO_NEIB){ // for no_neib
348
+ if ( mark[*x] ){ w[*x] -= TT->w[i]; w[i] -= TT->w[*x]; }
349
+ }
350
+ if ( c < PP->th ) goto SKIP; // threshold for the intersection size
351
+ PP->siz = c; // outputting intersection size
352
+ if ( PP->problem & fs ){
353
+ if ( PP->problem & SSPC_INTERSECTION ) II->frq = c;
354
+ else if ( PP->problem & SSPC_INNERPRODUCT ){
355
+ if ( TT->T.w ) II->frq = c;
356
+ else II->frq = c / sq / sqrt(w[*x]);
357
+ } else if ( (PP->problem & SSPC_RESEMBLANCE) && (cc= w[i] +w[*x] -c) != 0 ) II->frq = c/cc;
358
+ else if ( (PP->problem & SSPC_MAXINT) && (cc=MAX(w[i],w[*x])) != 0 ) II->frq = c/cc;
359
+ else if ( (PP->problem & SSPC_MININT) && (cc=MIN(w[i],w[*x])) != 0 ) II->frq = c/cc;
360
+ else if ( (PP->problem & SSPC_PMI) && (cc=w[i]*w[*x]) != 0 ) II->frq = log( ( c * TT->T.t) / cc ) / -log ( c / TT->T.t);
361
+ else continue;
362
+ if ( II->frq >= II->frq_lb ) SSPC_output (PP, &cnt, *x, i);
363
+ if ( PP->output_fname2 && II->frq >= PP->th2 ) fprintf (fp, "%d %d, %3f, %3f\n", II->itemset.v[0], II->itemset.v[1], w[i], w[*x]);
364
+ } else {
365
+ f1 = w[i]*II->frq_lb; f2 = w[*x]*II->frq_lb; // size of i and *x
366
+ if ( PP->output_fname2 ){ f1_ = w[i]*PP->th2; f2_ = w[*x]*PP->th2; }
367
+ if ( PP->problem & SSPC_SIMILARITY ){
368
+ f = ( (c >= f1) && (c >= f2) );
369
+ if ( PP->output_fname2 ) f_ = ( (c >= f1_) && (c >= f2_) );
370
+ II->frq = MIN(c/w[i], c/w[*x]);
371
+ } else if ( PP->problem & SSPC_INCLUSION ){
372
+ if ( c >= f2 ){
373
+ II->frq = c/w[*x];
374
+ II->itemset.v[0] = *x; II->itemset.v[1] = i-PP->root;
375
+ SSPC_output (PP, &cnt, *x, i);
376
+ II->itemset.v[0] = i-PP->root; II->itemset.v[1] = *x;
377
+ }
378
+ f = ( c >= f1 );
379
+ II->frq = c/w[i];
380
+ if ( PP->output_fname2 ){
381
+ if ( c >= f2_ ) fprintf (fp, "%d %d\n", II->itemset.v[1], II->itemset.v[0]);
382
+ f_ = (c >= f1_);
383
+ }
384
+ } else continue;
385
+ if ( f ) SSPC_output (PP, &cnt, *x, i);
386
+ if ( PP->output_fname2 && f_ ) fprintf (fp, "%d %d\n", II->itemset.v[0], II->itemset.v[1]);
387
+ }
388
+ SKIP:;
389
+ if (PP->problem & SSPC_NO_NEIB){ // for no_neib
390
+ if ( mark[*x] ){ w[*x] += TT->w[i]; w[i] += TT->w[*x]; }
391
+ }
392
+ TT->OQ[*x].end = 0;
393
+ }
394
+
395
+ if ( PP->problem & (SSPC_POLISH+SSPC_POLISH2) ){ // data polish; clear OQ, and marks
396
+ if ( PP->problem & SSPC_POLISH2 ) // data polish; clear OQ, and marks
397
+ for (b=PP->itemary[i] ; b ; b=PP->buf[b]) PP->vecchr[PP->buf[b+1]] = 1;
398
+ f = 0;
399
+ FLOOP (ii, TT->OQ[i].s, TT->OQ[i].t){
400
+ t = TT->OQ[i].v[ii]; ff = 0;
401
+ MQUE_MLOOP (TT->T.v[t], x, PP->TT.T.clms)
402
+ if ( PP->vecchr[*x] ){ ff = 1; break; }
403
+ if ( ff ){
404
+ FILE2_print_int (&PP->II.multi_fp[0], t, f);
405
+ f = PP->II.separator;
406
+ }
407
+ }
408
+ FILE2_putc (&PP->II.multi_fp[0], '\n');
409
+ FILE2_flush (&PP->II.multi_fp[0]);
410
+ MQUE_FLOOP (TT->jump, x) PP->vecchr[*x] = 0; // clear mark
411
+ if ( PP->problem & SSPC_POLISH2 ){ // data polish; clear OQ, and marks
412
+ for (b=PP->itemary[i] ; b ; b=bb){ // insert cells to deleted cell queue
413
+ bb = PP->buf[b];
414
+ PP->vecchr[PP->buf[b+1]] = 0;
415
+ PP->buf[b] = PP->itemary[PP->TT.T.clms];
416
+ PP->itemary[PP->TT.T.clms] = b;
417
+ }
418
+ }
419
+ }
420
+ /* else if ( PP->problem & SSPC_POLISH2 ){ // data polish; clear OQ, and marks
421
+ f = 0;
422
+ for (b=PP->itemary[i] ; b ; b=PP->buf[b]) PP->vecchr[PP->buf[b+1]] = 1;
423
+ FLOOP (ii, TT->OQ[i].s, TT->OQ[i].t){
424
+ t = TT->OQ[i].v[ii]; ff = 0;
425
+ MQUE_MLOOP (TT->T.v[t], x, m)
426
+ if ( PP->vecchr[*x] ){ ff = 1; break; }
427
+ if ( ff ){
428
+ FILE2_print_int (&PP->II.multi_fp[0], t, f);
429
+ f = PP->II.separator;
430
+ }
431
+ }
432
+ FILE2_putc (&PP->II.multi_fp[0], '\n');
433
+ FILE2_flush (&PP->II.multi_fp[0]);
434
+ MQUE_FLOOP (TT->jump, x) PP->vecchr[*x] = 0; // clear mark
435
+ }
436
+ */
437
+
438
+ if (PP->problem & SSPC_NO_NEIB) // for no_neib
439
+ FLOOP (ii, TT->OQ[i].s, TT->OQ[i].t){ mark[TT->OQ[i].v[ii]] = 0; }
440
+ TT->OQ[i].end = 0;
441
+ if ( PP->problem & SSPC_COUNT ){
442
+ while ( ii<II->perm[i] ){
443
+ FILE2_putc (&II->multi_fp[0], '\n');
444
+ FILE2_flush (&II->multi_fp[0]);
445
+ ii++;
446
+ }
447
+ FILE2_print_int (&II->multi_fp[0], cnt, 0);
448
+ FILE2_putc (&II->multi_fp[0], '\n');
449
+ FILE2_flush (&II->multi_fp[0]);
450
+ II->sc[2] += cnt;
451
+ ii++;
452
+ }
453
+ }
454
+
455
+ // termination
456
+ if ( TT->flag & LOAD_SIZSORT ){
457
+ FLOOP (i, 0, TT->T.clms){
458
+ TT->OQ[i].t += TT->OQ[i].v - o[i];
459
+ TT->OQ[i].v = o[i];
460
+ }
461
+ }
462
+ mfree (w, o, u, mark);
463
+ QUEUE_end (&J);
464
+ if ( PP->output_fname2 ) fclose (fp);
465
+ }
466
+
467
+
468
+ /*************************************************************************/
469
+ /* SSPC matrix version */
470
+ /*************************************************************************/
471
+ void SSPCmat (PROBLEM *PP){
472
+ ITEMSET *II = &PP->II;
473
+ MAT *MM = &PP->MM;
474
+ QUEUE_ID i, j, x, begin = PP->dir>0?0:1, f, ii=0;
475
+ QUEUE_INT cnt;
476
+ WEIGHT *w, f1, f2, c, cc;
477
+ double sq =0;
478
+ int fs = SSPC_INTERSECTION +SSPC_RESEMBLANCE +SSPC_INNERPRODUCT+SSPC_MAXINT+SSPC_MININT;
479
+
480
+ II->frq_lb = II->frq_lb * II->frq_lb;
481
+
482
+ // initialization
483
+ // calloc2 (w, MM->t, EXIT);
484
+ // if ( PP->problem & SSPC_INNERPRODUCT ) FLOOP (i, 0, MM->clms) TT->w[i] *= TT->w[i];
485
+ // TRSACT_delivery (TT, &TT->jump, w, w+TT->T.clms, NULL, TT->T.clms);
486
+ // FLOOP (i, 0, PP.TT.T.clms) TT->OQ[i].end = 0;
487
+ II->itemset.t = 2;
488
+
489
+ // skipping items of large frequencies
490
+ // if ( TT->flag & LOAD_SIZSORT ){
491
+ // malloc2 (o, TT->T.clms, EXIT);
492
+ // FLOOP (i, 0, TT->T.clms){
493
+ // o[i] = TT->OQ[i].v;
494
+ // TT->OQ[i].v[TT->OQ[i].t] = INTHUGE; // put end-mark at the last; also used in main loop
495
+ // for ( j=0 ; TT->OQ[i].v[j] < PP->II.len_lb ; j++ );
496
+ // TT->OQ[i].v = &TT->OQ[i].v[j]; TT->OQ[i].t -= j;
497
+ // }
498
+ // }
499
+
500
+ // main loop
501
+ FLOOP (i, begin, MM->t){
502
+ cnt = 0;
503
+ II->itemset.v[0] = ((PP->problem&PROBLEM_NORMALIZE)&& PP->dir>0)? i-MM->clms: i; // i-TT->sep
504
+ if ( PP->problem || 1 ){
505
+ PP->occ_w[i] = 0;
506
+ FLOOP (x, 0, MM->clms) PP->occ_w[i] += MM->v[i].v[x] * MM->v[i].v[x];
507
+ }
508
+
509
+ FLOOP (j, 0, PP->dir>0?begin:i){
510
+ II->itemset.v[1] = j;
511
+ f = 0; sq = 0;
512
+ FLOOP (x, 0, MM->clms) sq += MM->v[i].v[x] * MM->v[j].v[x];
513
+ if ( sq / PP->occ_w[i] / PP->occ_w[j] > II->frq_lb ) f = 1;
514
+
515
+ if ( f ){
516
+ if ( PP->problem & SSPC_COUNT ) cnt++;
517
+ else ITEMSET_output_itemset (II, NULL, 0);
518
+ }
519
+ }
520
+ if ( PP->problem & SSPC_COUNT ){
521
+ while ( ii<II->perm[i] ){
522
+ FILE2_putc (&II->multi_fp[0], '\n');
523
+ FILE2_flush (&II->multi_fp[0]);
524
+ ii++;
525
+ }
526
+ FILE2_print_int (&II->multi_fp[0], cnt, 0);
527
+ FILE2_putc (&II->multi_fp[0], '\n');
528
+ FILE2_flush (&II->multi_fp[0]);
529
+ II->sc[2] += cnt;
530
+ ii++;
531
+ }
532
+ }
533
+
534
+ // termination
535
+ // mfree (w, o);
536
+ }
537
+
538
+
539
+
540
+ /*************************************************************************/
541
+ /* main function of SSPC */
542
+ /*************************************************************************/
543
+ int SSPC_main (int argc, char *argv[]){
544
+ PROBLEM PP;
545
+ SETFAMILY *T = &PP.TT.T;
546
+ QUEUE_ID i;
547
+
548
+ PROBLEM_init (&PP);
549
+ SSPC_read_param (argc, argv, &PP);
550
+ if ( ERROR_MES ) return (1);
551
+
552
+ PP.TT.flag |= LOAD_INCSORT;
553
+ PP.TT.flag2 |= TRSACT_ALLOC_OCC;
554
+ if ( PP.II.len_ub<INTHUGE || PP.II.len_lb>0 ) PP.TT.flag |= LOAD_SIZSORT+LOAD_DECROWSORT;
555
+ PROBLEM_load (&PP);
556
+ //TRSACT_print (&PP.TT, NULL, NULL);
557
+ //printf ("come\n");
558
+ internal_params.l2 = T->t;
559
+ internal_params.l3 = T->clms;
560
+
561
+ if ( PP.II.len_ub < INTHUGE ){
562
+ FLOOP (i, 0, PP.TT.T.t) if ( PP.TT.T.v[i].t <= PP.II.len_ub ){ PP.II.len_lb = i; break; }
563
+ }
564
+ if ( PP.II.itemtopk_item > 0 ) PP.II.itemtopk_end = T->clms;
565
+ PROBLEM_alloc (&PP, T->clms, T->t, 0, PP.TT.perm, PROBLEM_OCC_W +PROBLEM_VECCHR +((PP.problem&SSPC_POLISH2)?PROBLEM_ITEMARY:0));
566
+ PP.TT.perm = NULL;
567
+ realloc2 (PP.TT.w, MAX(T->t, T->clms)+1, EXIT);
568
+ ARY_FILL (PP.TT.w, 0, MAX(T->t, T->clms)+1, 1);
569
+
570
+ print_mes (&PP.TT, "separated at %d\n", PP.TT.sep);
571
+ QUEUE_delivery (PP.TT.OQ, NULL, NULL, T->v, &PP.TT.OQ[T->clms], T->t, T->clms);
572
+ PP.buf_end = 2;
573
+ if ( !ERROR_MES && PP.TT.T.clms>1 ){
574
+ if ( PP.problem & SSPC_MATRIX ){ SSPCmat (&PP); }
575
+ else SSPC (&PP);
576
+ }
577
+
578
+ ITEMSET_merge_counters (&PP.II);
579
+ internal_params.l1 = PP.II.solutions;
580
+
581
+ if ( PP.II.topk.end > 0 || PP.II.itemtopk_end > 0 ) ITEMSET_last_output (&PP.II);
582
+ else print_mes (&PP.TT, LONGF " pairs are found\n", PP.II.sc[2]);
583
+
584
+ PROBLEM_end (&PP);
585
+ return (ERROR_MES?1:0);
586
+ }
587
+
588
+ /*******************************************************************************/
589
+ #ifndef _NO_MAIN_
590
+ #define _NO_MAIN_
591
/* Stand-alone entry point: the exit status is whatever SSPC_main reports. */
int main (int argc, char *argv[]){
  int status = SSPC_main (argc, argv);
  return (status);
}
594
+ #endif
595
+ /*******************************************************************************/
596
+
597
+ #endif