apriori 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. data/History.txt +16 -0
  2. data/License.txt +20 -0
  3. data/Manifest.txt +121 -0
  4. data/README.txt +149 -0
  5. data/Rakefile +15 -0
  6. data/TODO.txt +60 -0
  7. data/attic/c_ext_test1/MyTest/MyTest.c +23 -0
  8. data/attic/c_ext_test1/MyTest/extconf.rb +11 -0
  9. data/attic/c_ext_test1/mytest.rb +10 -0
  10. data/attic/test.c +12 -0
  11. data/config/hoe.rb +81 -0
  12. data/config/requirements.rb +29 -0
  13. data/examples/01_simple_example.rb +32 -0
  14. data/examples/02_small_file_example.rb +17 -0
  15. data/examples/03_large_file_example.rb +22 -0
  16. data/examples/test_data/market_basket_basic_test.dat +9 -0
  17. data/ext/Apriori.c +149 -0
  18. data/ext/Makefile +149 -0
  19. data/ext/apriori/doc/apriori.html +1301 -0
  20. data/ext/apriori/doc/arem.gp +68 -0
  21. data/ext/apriori/doc/c_rev.gp +89 -0
  22. data/ext/apriori/doc/chi2.tex +156 -0
  23. data/ext/apriori/doc/copying +504 -0
  24. data/ext/apriori/doc/line.gif +0 -0
  25. data/ext/apriori/doc/uparrow.gif +0 -0
  26. data/ext/apriori/ex/flg2set +15 -0
  27. data/ext/apriori/ex/hdr2set +13 -0
  28. data/ext/apriori/ex/readme +71 -0
  29. data/ext/apriori/ex/row2set +7 -0
  30. data/ext/apriori/ex/rulesort +24 -0
  31. data/ext/apriori/ex/tab2set +9 -0
  32. data/ext/apriori/ex/test.app +2 -0
  33. data/ext/apriori/ex/test.rul +9 -0
  34. data/ext/apriori/ex/test1.rul +43 -0
  35. data/ext/apriori/ex/test1.tab +10 -0
  36. data/ext/apriori/ex/test2.tab +10 -0
  37. data/ext/apriori/ex/test3.tab +30 -0
  38. data/ext/apriori/ex/test4.tab +11 -0
  39. data/ext/apriori/ex/test5.tab +39 -0
  40. data/ext/apriori/ex/tid2set +23 -0
  41. data/ext/apriori/ex/xhdr2set +33 -0
  42. data/ext/apriori/src/apriori.c +750 -0
  43. data/ext/apriori/src/apriori.dsp +120 -0
  44. data/ext/apriori/src/apriori.dsw +29 -0
  45. data/ext/apriori/src/apriori.mak +99 -0
  46. data/ext/apriori/src/istree.c +1411 -0
  47. data/ext/apriori/src/istree.h +160 -0
  48. data/ext/apriori/src/makefile +105 -0
  49. data/ext/apriori/src/tract.c +870 -0
  50. data/ext/apriori/src/tract.h +261 -0
  51. data/ext/apriori_wrapper.c +757 -0
  52. data/ext/apriori_wrapper.h +10 -0
  53. data/ext/extconf.rb +32 -0
  54. data/ext/math/doc/copying +504 -0
  55. data/ext/math/src/chi2.c +151 -0
  56. data/ext/math/src/chi2.h +27 -0
  57. data/ext/math/src/choose.c +71 -0
  58. data/ext/math/src/choose.h +16 -0
  59. data/ext/math/src/gamma.c +446 -0
  60. data/ext/math/src/gamma.h +39 -0
  61. data/ext/math/src/intexp.c +35 -0
  62. data/ext/math/src/intexp.h +15 -0
  63. data/ext/math/src/makefile +164 -0
  64. data/ext/math/src/math.mak +48 -0
  65. data/ext/math/src/normal.c +387 -0
  66. data/ext/math/src/normal.h +44 -0
  67. data/ext/math/src/radfn.c +113 -0
  68. data/ext/math/src/radfn.h +34 -0
  69. data/ext/math/src/zeta.c +49 -0
  70. data/ext/math/src/zeta.h +15 -0
  71. data/ext/pre-clean.rb +8 -0
  72. data/ext/pre-setup.rb +9 -0
  73. data/ext/util/doc/copying +504 -0
  74. data/ext/util/src/listops.c +76 -0
  75. data/ext/util/src/listops.h +26 -0
  76. data/ext/util/src/makefile +103 -0
  77. data/ext/util/src/memsys.c +84 -0
  78. data/ext/util/src/memsys.h +42 -0
  79. data/ext/util/src/nstats.c +288 -0
  80. data/ext/util/src/nstats.h +69 -0
  81. data/ext/util/src/params.c +86 -0
  82. data/ext/util/src/params.h +19 -0
  83. data/ext/util/src/parse.c +133 -0
  84. data/ext/util/src/parse.h +81 -0
  85. data/ext/util/src/scan.c +767 -0
  86. data/ext/util/src/scan.h +111 -0
  87. data/ext/util/src/symtab.c +443 -0
  88. data/ext/util/src/symtab.h +121 -0
  89. data/ext/util/src/tabscan.c +279 -0
  90. data/ext/util/src/tabscan.h +99 -0
  91. data/ext/util/src/util.mak +91 -0
  92. data/ext/util/src/vecops.c +317 -0
  93. data/ext/util/src/vecops.h +42 -0
  94. data/lib/apriori.rb +133 -0
  95. data/lib/apriori/adapter.rb +13 -0
  96. data/lib/apriori/association_rule.rb +89 -0
  97. data/lib/apriori/version.rb +9 -0
  98. data/script/console +10 -0
  99. data/script/destroy +14 -0
  100. data/script/generate +14 -0
  101. data/script/txt2html +82 -0
  102. data/setup.rb +1585 -0
  103. data/tasks/apriori.rake +20 -0
  104. data/tasks/attic.rake +28 -0
  105. data/tasks/deployment.rake +34 -0
  106. data/tasks/environment.rake +7 -0
  107. data/tasks/install.rake +13 -0
  108. data/tasks/website.rake +17 -0
  109. data/test/apriori_test.rb +13 -0
  110. data/test/fixtures/market_basket_results_test.txt +5 -0
  111. data/test/fixtures/market_basket_string_test.txt +7 -0
  112. data/test/fixtures/results.txt +2 -0
  113. data/test/fixtures/sample.txt +7 -0
  114. data/test/test_helper.rb +5 -0
  115. data/test/unit/test_apriori.rb +68 -0
  116. data/test/unit/test_itemsets_and_parsing.rb +82 -0
  117. data/website/index.html +248 -0
  118. data/website/index.txt +152 -0
  119. data/website/javascripts/rounded_corners_lite.inc.js +285 -0
  120. data/website/stylesheets/screen.css +142 -0
  121. data/website/template.html.erb +49 -0
  122. metadata +226 -0
@@ -0,0 +1,160 @@
1
+ /*----------------------------------------------------------------------
2
+ File : istree.h
3
+ Contents: item set tree management
4
+ Author : Christian Borgelt
5
+ History : 1996.01.22 file created
6
+ 1996.01.29 ISNODE.offset and ISNODE.id added
7
+ 1996.02.08 ISTREE.tacnt, ISTREE.curr, ISTREE.index,
8
+ ISTREE.head and ISTREE.conf added
9
+ 1996.03.28 support made relative to number of item sets
10
+ 1996.11.23 ISTREE.levels (first nodes of each level) added
11
+ 1996.11.24 ISTREE.arem (add. rule evaluation measure) added
12
+ 1997.08.18 chi^2 evaluation measure added
13
+ parameter 'minlen' added to function ist_init()
14
+ 1998.02.11 parameter 'minval' added to function ist_init()
15
+ 1998.05.14 item set tree navigation functions added
16
+ 1998.08.08 parameter 'apps' added to function ist_create()
17
+ 1998.08.20 structure ISNODE redesigned
18
+ 1998.09.07 function ist_hedge added
19
+ 1998.12.08 function ist_gettac added,
20
+ float changed to double
21
+ 1999.02.05 long int changed to int
22
+ 1999.08.26 functions ist_first and ist_last added
23
+ 1999.11.05 rule evaluation measure EM_AIMP added
24
+ 1999.11.08 parameter 'aval' added to function ist_rule
25
+ 2001.04.01 functions ist_set and ist_getcntx added
26
+ 2001.12.28 sort function moved to module tract
27
+ 2002.02.07 function ist_clear removed, ist_settac added
28
+ 2002.02.11 optional use of identifier maps in nodes added
29
+ 2002.02.12 ist_first and ist_last replaced by ist_next
30
+ 2003.03.12 parameter lift added to function ist_rule
31
+ 2003.07.17 functions ist_itemcnt and ist_check added
32
+ 2003.07.18 function ist_maxfrq added (item set filter)
33
+ 2003.08.11 item set filtering generalized (ist_filter)
34
+ 2004.05.09 parameter 'aval' added to function ist_set
35
+ 2008.03.24 creation based on ITEMSET structure
36
+ ----------------------------------------------------------------------*/
37
+ #ifndef __ISTREE__
38
+ #define __ISTREE__
39
+ #include "tract.h"
40
+
41
+ /*----------------------------------------------------------------------
42
+ Preprocessor Definitions
43
+ ----------------------------------------------------------------------*/
44
+ /* --- additional evaluation measures --- */
45
+ #define EM_NONE 0 /* no measure */
46
+ #define EM_DIFF 1 /* absolute conf. difference to prior */
47
+ #define EM_QUOT 2 /* difference of conf. quotient to 1 */
48
+ #define EM_AIMP 3 /* abs. diff. of improvement to 1 */
49
+ #define EM_INFO 4 /* information difference to prior */
50
+ #define EM_CHI2 5 /* normalized chi^2 measure */
51
+ #define EM_PVAL 6 /* p-value of chi^2 measure */
52
+ #define EM_UNKNOWN 7 /* unknown measure */
53
+
54
+ /* --- item appearances --- */
55
+ #define IST_IGNORE 0 /* ignore item */
56
+ #define IST_BODY 1 /* item may appear in rule body */
57
+ #define IST_HEAD 2 /* item may appear in rule head */
58
+ #define IST_BOTH (IST_HEAD|IST_BODY)
59
+
60
+ /* --- search mode flags --- */
61
+ #define IST_MEMOPT 4 /* optimize memory usage */
62
+
63
+ /* --- item set filter modes --- */
64
+ #define IST_CLEAR 0 /* clear markers */
65
+ #define IST_CLOSED 1 /* closed item sets */
66
+ #define IST_MAXFRQ 2 /* maximal item sets */
67
+
68
+ /*----------------------------------------------------------------------
69
+ Type Definitions
70
+ ----------------------------------------------------------------------*/
71
+ typedef struct _isnode { /* --- item set node --- */
72
+ struct _isnode *parent; /* parent node */
73
+ struct _isnode *succ; /* successor node on same level */
74
+ int id; /* identifier used in parent node */
75
+ int chcnt; /* number of child nodes */
76
+ int size; /* size of counter vector */
77
+ int offset; /* offset of counter vector */
78
+ int cnts[1]; /* counter vector; declared [1], presumably over-allocated to 'size' entries -- TODO confirm in istree.c */
79
+ } ISNODE; /* (item set node) */
80
+
81
+ typedef struct { /* --- item set tree --- */
82
+ ITEMSET *set; /* underlying item set */
83
+ int mode; /* search mode (e.g. support def.) */
84
+ int tacnt; /* number of transactions */
85
+ int vsz; /* size of level vector */
86
+ int height; /* tree height (number of levels) */
87
+ ISNODE **lvls; /* first node of each level */
88
+ int rule; /* minimal support of an assoc. rule */
89
+ int supp; /* minimal support of an item set */
90
+ double conf; /* minimal confidence of a rule */
91
+ int arem; /* additional rule evaluation measure */
92
+ double minval; /* minimal evaluation measure value */
93
+ ISNODE *curr; /* current node for traversal */
94
+ int size; /* size of item set/rule/hyperedge */
95
+ ISNODE *node; /* item set node for extraction */
96
+ int index; /* index in item set node */
97
+ ISNODE *head; /* head item node for extraction */
98
+ int item; /* head item of previous rule */
99
+ int *buf; /* buffer for paths (support check) */
100
+ int *path; /* current path / (partial) item set */
101
+ int plen; /* current path length */
102
+ int hdonly; /* head only item in current set */
103
+ int *map; /* to create identifier maps */
104
+ #ifdef BENCH /* if benchmark version */
105
+ int sccnt; /* number of support counters */
106
+ int scnec; /* number of necessary supp. counters */
107
+ int cpcnt; /* number of child pointers */
108
+ int cpnec; /* number of necessary child pointers */
109
+ int bytes; /* number of bytes used */
110
+ #endif
111
+ } ISTREE; /* (item set tree) */
112
+
113
+ /*----------------------------------------------------------------------
114
+ Functions
115
+ ----------------------------------------------------------------------*/
116
+ extern ISTREE* ist_create (ITEMSET *set, int mode,
117
+ int supp, double conf);
118
+ extern void ist_delete (ISTREE *ist);
119
+ extern int ist_itemcnt (ISTREE *ist);
120
+
121
+ extern void ist_count (ISTREE *ist, int *set, int cnt);
122
+ extern void ist_countx (ISTREE *ist, TATREE *tat);
123
+ extern int ist_settac (ISTREE *ist, int cnt);
124
+ extern int ist_gettac (ISTREE *ist);
125
+ extern int ist_check (ISTREE *ist, char *marks);
126
+ extern int ist_addlvl (ISTREE *ist);
127
+ extern int ist_height (ISTREE *ist);
128
+
129
+ extern void ist_up (ISTREE *ist, int root);
130
+ extern int ist_down (ISTREE *ist, int item);
131
+ extern int ist_next (ISTREE *ist, int item);
132
+ extern void ist_setcnt (ISTREE *ist, int item, int cnt);
133
+ extern int ist_getcnt (ISTREE *ist, int item);
134
+ extern int ist_getcntx (ISTREE *ist, int *set, int cnt);
135
+
136
+ extern void ist_filter (ISTREE *ist, int mode);
137
+ extern void ist_init (ISTREE *ist, int minlen,
138
+ int arem, double minval);
139
+ extern int ist_set (ISTREE *ist, int *set, int *supp,
140
+ double *aval);
141
+ extern int ist_rule (ISTREE *ist, int *rule, int *supp,
142
+ double *conf, double *lift, double *aval);
143
+ extern int ist_hedge (ISTREE *ist, int *hedge, int *supp,
144
+ double *conf, double *aval);
145
+ extern int ist_group (ISTREE *ist, int *asmb, int *supp,
146
+ double *aval);
147
+
148
+ #ifndef NDEBUG
149
+ extern void ist_show (ISTREE *ist);
150
+ #endif
151
+
152
+ /*----------------------------------------------------------------------
153
+ Preprocessor Definitions
154
+ ----------------------------------------------------------------------*/
155
+ #define ist_itemcnt(t) ((t)->lvls[0]->size) /* counter vector size of first level-0 node; member is 'lvls', not 'levels' */
156
+ #define ist_settac(t,n) ((t)->tacnt = (n)) /* store the transaction count, yields the new value */
157
+ #define ist_gettac(t) ((t)->tacnt) /* number of transactions */
158
+ #define ist_height(t) ((t)->height) /* tree height (number of levels) */
159
+
160
+ #endif
@@ -0,0 +1,105 @@
1
+ #-----------------------------------------------------------------------
2
+ # File : makefile
3
+ # Contents: build apriori program
4
+ # Author : Christian Borgelt
5
+ # History : ??.??.1995 file created
6
+ # 1997.10.13 macro ADDFLAGS added
7
+ # 1997.12.07 minor improvements
8
+ # 1998.01.04 table scanner management added
9
+ # 1999.11.11 vector operations module added
10
+ # 2000.11.04 modules vecops, symtab, and tabscan made external
11
+ # 2001.11.18 module tract (transaction management) added
12
+ # 2003.12.12 preprocessor definition ARCH64 added
13
+ #-----------------------------------------------------------------------
14
+ CC = gcc
15
+ CFBASE = -ansi -Wall -pedantic -I$(UTILDIR) -I$(MATHDIR) $(ADDFLAGS)
16
+ CFLAGS = $(CFBASE) -DNDEBUG -O3
17
+ # CFLAGS = $(CFBASE) -DNDEBUG -O3 -DBENCH
18
+ # CFLAGS = $(CFBASE) -DNDEBUG -O3 -DARCH64
19
+ # CFLAGS = $(CFBASE) -g
20
+ # CFLAGS = $(CFBASE) -g -DARCH64
21
+ # CFLAGS = $(CFBASE) -g -DSTORAGE $(ADDINC)
22
+ LDFLAGS =
23
+ LIBS = -lm
24
+ # ADDINC = -I../../misc/src
25
+ # ADDOBJ = storage.o
26
+
27
+ UTILDIR = ../../util/src
28
+ MATHDIR = ../../math/src
29
+ HDRS = $(UTILDIR)/vecops.h $(UTILDIR)/symtab.h \
30
+ $(UTILDIR)/tabscan.h $(UTILDIR)/scan.h \
31
+ $(MATHDIR)/gamma.h $(MATHDIR)/chi2.h \
32
+ tract.h istree.h
33
+ OBJS = $(UTILDIR)/vecops.o $(UTILDIR)/nimap.o \
34
+ $(UTILDIR)/tabscan.o $(UTILDIR)/scform.o \
35
+ $(MATHDIR)/gamma.o $(MATHDIR)/chi2.o \
36
+ tract.o istree.o apriori.o $(ADDOBJ)
37
+
38
+ #-----------------------------------------------------------------------
39
+ # Build Program
40
+ #-----------------------------------------------------------------------
41
+ all: apriori
42
+
43
+ apriori: $(OBJS) makefile
44
+ $(CC) $(LDFLAGS) $(OBJS) $(LIBS) -o $@
45
+
46
+ #-----------------------------------------------------------------------
47
+ # Main Program
48
+ #-----------------------------------------------------------------------
49
+ apriori.o: tract.h istree.h $(UTILDIR)/symtab.h
50
+ apriori.o: apriori.c makefile
51
+ $(CC) $(CFLAGS) -c apriori.c -o $@
52
+
53
+ #-----------------------------------------------------------------------
54
+ # Item and Transaction Management
55
+ #-----------------------------------------------------------------------
56
+ tract.o: tract.h $(UTILDIR)/symtab.h
57
+ tract.o: tract.c makefile
58
+ $(CC) $(CFLAGS) -c tract.c -o $@
59
+
60
+ #-----------------------------------------------------------------------
61
+ # Frequent Item Set Tree Management
62
+ #-----------------------------------------------------------------------
63
+ istree.o: istree.h tract.h $(MATHDIR)/gamma.h
64
+ istree.o: istree.c makefile
65
+ $(CC) $(CFLAGS) -c istree.c -o $@
66
+
67
+ #-----------------------------------------------------------------------
68
+ # External Modules
69
+ #-----------------------------------------------------------------------
70
+ $(UTILDIR)/vecops.o: # '&&' aborts if cd fails; quotes keep a multi-flag ADDFLAGS in one argument
71
+ cd $(UTILDIR) && $(MAKE) vecops.o ADDFLAGS="$(ADDFLAGS)"
72
+ $(UTILDIR)/nimap.o:
73
+ cd $(UTILDIR) && $(MAKE) nimap.o ADDFLAGS="$(ADDFLAGS)"
74
+ $(UTILDIR)/tabscan.o:
75
+ cd $(UTILDIR) && $(MAKE) tabscan.o ADDFLAGS="$(ADDFLAGS)"
76
+ $(UTILDIR)/scform.o:
77
+ cd $(UTILDIR) && $(MAKE) scform.o ADDFLAGS="$(ADDFLAGS)"
78
+ $(MATHDIR)/gamma.o:
79
+ cd $(MATHDIR) && $(MAKE) gamma.o ADDFLAGS="$(ADDFLAGS)"
80
+ $(MATHDIR)/chi2.o:
81
+ cd $(MATHDIR) && $(MAKE) chi2.o ADDFLAGS="$(ADDFLAGS)"
82
+
83
+ #-----------------------------------------------------------------------
84
+ # Storage Debugging
85
+ #-----------------------------------------------------------------------
86
+ storage.o: ../../misc/src/storage.h
87
+ storage.o: ../../misc/src/storage.c
88
+ $(CC) $(CFLAGS) -c ../../misc/src/storage.c -o $@
89
+
90
+ #-----------------------------------------------------------------------
91
+ # Install
92
+ #-----------------------------------------------------------------------
93
+ install:
94
+ cp apriori $(HOME)/bin
95
+
96
+ #-----------------------------------------------------------------------
97
+ # Clean up
98
+ #-----------------------------------------------------------------------
99
+ clean: # '&&' keeps a failed cd from re-running 'make clean' in this directory (endless recursion)
100
+ rm -f *.o *~ *.flc core apriori
101
+ cd $(UTILDIR) && $(MAKE) clean
102
+ cd $(MATHDIR) && $(MAKE) clean
103
+
104
+ localclean:
105
+ rm -f *.o *~ *.flc core apriori
@@ -0,0 +1,870 @@
1
+ /*----------------------------------------------------------------------
2
+ File : tract.c
3
+ Contents: item and transaction management
4
+ Author : Christian Borgelt
5
+ History : 1996.02.14 file created as apriori.c
6
+ 1996.06.24 function _get_item optimized
7
+ 1996.07.01 adapted to modified symtab module
8
+ 1998.01.04 scan functions moved to module 'tabscan'
9
+ 1998.06.09 vector enlargement modified
10
+ 1998.06.20 adapted to changed st_create function
11
+ 1998.08.07 bug in function _get_tract (is_read) fixed
12
+ 1998.08.08 item appearances added
13
+ 1998.08.17 item sorting and recoding added
14
+ 1998.09.02 several assertions added
15
+ 1999.02.05 long int changed to int
16
+ 1999.10.22 bug in item appearances reading fixed
17
+ 1999.11.11 adapted to name/identifier maps
18
+ 1999.12.01 check of item appearance added to sort function
19
+ 2000.03.15 removal of infrequent items added
20
+ 2001.07.14 adapted to modified module tabscan
21
+ 2001.12.27 item functions made a separate module
22
+ 2001.11.18 transaction functions made a separate module
23
+ 2001.12.28 first version of this module completed
24
+ 2002.01.12 empty field at end of record reported as error
25
+ 2002.02.06 item sorting reversed (ascending order)
26
+ 2002.02.19 transaction tree functions added
27
+ 2003.07.17 functions is_filter, ta_filter, tas_filter added
28
+ 2003.08.15 bug in function tat_delete fixed
29
+ 2003.08.21 parameter 'heap' added to tas_sort, tat_create
30
+ 2003.09.20 empty transactions in input made possible
31
+ 2003.12.18 padding for 64 bit architecture added
32
+ 2004.02.26 item frequency counting moved to is_read
33
+ 2004.11.20 function tat_mark added
34
+ 2005.06.20 function _nocmp added for neutral sorting
35
+ 2006.11.26 structures ISFMTR and ISEVAL added
36
+ 2007.02.13 adapted to modified tabscan module
37
+ 2008.01.25 bug in function ise_eval fixed (prefix)
38
+ 2008.06.30 support argument to ise_eval changed to double
39
+ ----------------------------------------------------------------------*/
40
+ #include <stdio.h>
41
+ #include <stdlib.h>
42
+ #include <string.h>
43
+ #include <limits.h>
44
+ #include <assert.h>
45
+ #include <math.h>
46
+ #include "tract.h"
47
+ #include "scan.h"
48
+ #ifdef STORAGE
49
+ #include "storage.h"
50
+ #endif
51
+
52
+ /*----------------------------------------------------------------------
53
+ Preprocessor Definitions
54
+ ----------------------------------------------------------------------*/
55
+ #define BLKSIZE 256 /* block size for enlarging vectors */
56
+
57
+ #define LN_2 0.69314718055994530942 /* ln(2) */
58
+
59
+ /*----------------------------------------------------------------------
60
+ Constants
61
+ ----------------------------------------------------------------------*/
62
+ /* --- item appearance indicators --- */
63
+ static const char *i_body[] = { /* item to appear in bodies only */
64
+ "i", "in", "a", "ante", "antecedent", "b", "body", NULL };
65
+ static const char *i_head[] = { /* item to appear in heads only */
66
+ "o", "out", "c", "cons", "consequent", "h", "head", NULL };
67
+ static const char *i_both[] = { /* item to appear in both */
68
+ "io", "inout", "ac", "bh", "both", NULL };
69
+ static const char *i_ignore[] ={/* item to ignore */
70
+ "n", "neither", "none", "ign", "ignore", "-", NULL };
71
+
72
+ /*----------------------------------------------------------------------
73
+ Auxiliary Functions
74
+ ----------------------------------------------------------------------*/
75
+
76
+ static int _appcode (const char *s)
77
+ { /* --- translate an appearance indicator string into its APP_ code */
78
+ const char **ind; /* cursor into the current indicator table */
79
+
80
+ assert(s); /* the indicator string is mandatory */
81
+ for (ind = i_body; *ind != NULL; ++ind) /* names meaning 'body only' */
82
+ if (!strcmp(*ind, s)) return APP_BODY;
83
+ for (ind = i_head; *ind != NULL; ++ind) /* names meaning 'head only' */
84
+ if (!strcmp(*ind, s)) return APP_HEAD;
85
+ for (ind = i_both; *ind != NULL; ++ind) /* names meaning 'body and head' */
86
+ if (!strcmp(*ind, s)) return APP_BOTH;
87
+ for (ind = i_ignore; *ind != NULL; ++ind) /* names meaning 'ignore item' */
88
+ if (!strcmp(*ind, s)) return APP_NONE;
89
+ return -1; /* no table contains the string */
90
+ } /* _appcode() */
91
+
92
+ /*--------------------------------------------------------------------*/
93
+
94
+ static int _get_item (ITEMSET *iset, FILE *file)
95
+ { /* --- read one item name and append its identifier to iset->items */
96
+ int d; /* delimiter type */
97
+ char *buf; /* read buffer */
98
+ ITEM *item; /* pointer to item */
99
+ int *vec; /* new item vector */
100
+ int size; /* new item vector size */
101
+
102
+ assert(iset && file); /* check the function arguments */
103
+ d = ts_next(iset->tscan, file, NULL, 0);
104
+ buf = ts_buf(iset->tscan); /* read the next field (item name) */
105
+ if ((d == TS_ERR) || (buf[0] == '\0')) return d; /* on error or empty field nothing is added */
106
+ item = nim_byname(iset->nimap, buf);
107
+ if (!item) { /* look up the name in name/id map */
108
+ if (iset->app == APP_NONE) /* if new items are to be ignored, */
109
+ return d; /* do not register the item */
110
+ item = nim_add(iset->nimap, buf, sizeof(ITEM));
111
+ if (!item) return E_NOMEM; /* add the new item to the map, */
112
+ item->frq = item->xfq = 0; /* initialize the frequency counters */
113
+ item->app = iset->app; /* (occurrence and sum of t.a. sizes) */
114
+ } /* and set the appearance indicator */
115
+ size = iset->vsz; /* get the item vector size */
116
+ if (iset->cnt >= size) { /* if the item vector is full */
117
+ size += (size > BLKSIZE) ? (size >> 1) : BLKSIZE; /* grow by half, or by BLKSIZE while small */
118
+ vec = (int*)realloc(iset->items, size *sizeof(int));
119
+ if (!vec) return E_NOMEM; /* on failure iset->items and iset->vsz stay unchanged */
120
+ iset->items = vec; iset->vsz = size;
121
+ } /* set the new vector and its size */
122
+ iset->items[iset->cnt++] = item->id;
123
+ return d; /* add the item to the transaction */
124
+ } /* _get_item() */ /* and return the delimiter type */
125
+
126
+ /*--------------------------------------------------------------------*/
127
+
128
+ static int _nocmp (const void *p1, const void *p2, void *data)
129
+ { /* --- neutral order: usable items by ascending identifier */
130
+ const ITEM *a = (const ITEM*)p1; /* left operand */
131
+ const ITEM *b = (const ITEM*)p2; /* right operand */
132
+ #ifdef ARCH64
133
+ long min = (long)data; /* frequency threshold carried in the pointer */
134
+ #else
135
+ int min = (int)data; /* frequency threshold carried in the pointer */
136
+ #endif
137
+
138
+ if (a->app == APP_NONE) return (b->app == APP_NONE) ? 0 : 1; /* ignored items go last */
139
+ if (b->app == APP_NONE) return -1;
140
+ if (a->frq < min) return (b->frq < min) ? 0 : 1; /* items below the threshold follow the rest */
141
+ if (b->frq < min) return -1;
142
+ if (a->id > b->id) return 1; /* otherwise order by identifier */
143
+ if (a->id < b->id) return -1;
144
+ return 0; /* identical identifiers */
145
+ } /* _nocmp() */
146
+
147
+ /*--------------------------------------------------------------------*/
148
+
149
+ static int _asccmp (const void *p1, const void *p2, void *data)
150
+ { /* --- usable items by ascending frequency */
151
+ const ITEM *a = (const ITEM*)p1; /* left operand */
152
+ const ITEM *b = (const ITEM*)p2; /* right operand */
153
+ #ifdef ARCH64
154
+ long min = (long)data; /* frequency threshold carried in the pointer */
155
+ #else
156
+ int min = (int)data; /* frequency threshold carried in the pointer */
157
+ #endif
158
+
159
+ if (a->app == APP_NONE) return (b->app == APP_NONE) ? 0 : 1; /* ignored items go last */
160
+ if (b->app == APP_NONE) return -1;
161
+ if (a->frq < min) return (b->frq < min) ? 0 : 1; /* items below the threshold follow the rest */
162
+ if (b->frq < min) return -1;
163
+ if (a->frq > b->frq) return 1; /* otherwise order by frequency */
164
+ if (a->frq < b->frq) return -1;
165
+ return 0; /* equal frequencies */
166
+ } /* _asccmp() */
167
+
168
+ /*--------------------------------------------------------------------*/
169
+
170
+ static int _descmp (const void *p1, const void *p2, void *data)
171
+ { /* --- usable items by descending frequency ('data' is not used) */
172
+ const ITEM *a = (const ITEM*)p1, *b = (const ITEM*)p2;
173
+ if (a->app == APP_NONE) return (b->app == APP_NONE) ? 0 : 1; /* ignored items go last */
174
+ if (b->app == APP_NONE) return -1;
175
+ if (a->frq > b->frq) return -1; /* higher frequency sorts first */
176
+ if (a->frq < b->frq) return 1;
177
+ return 0; /* equal frequencies */
178
+ } /* _descmp() */
179
+
180
+ /*--------------------------------------------------------------------*/
181
+
182
+ static int _asccmpx (const void *p1, const void *p2, void *data)
183
+ { /* --- usable items by ascending transaction size sum (xfq) */
184
+ const ITEM *a = (const ITEM*)p1; /* left operand */
185
+ const ITEM *b = (const ITEM*)p2; /* right operand */
186
+ #ifdef ARCH64
187
+ long min = (long)data; /* frequency threshold carried in the pointer */
188
+ #else
189
+ int min = (int)data; /* frequency threshold carried in the pointer */
190
+ #endif
191
+
192
+ if (a->app == APP_NONE) return (b->app == APP_NONE) ? 0 : 1; /* ignored items go last */
193
+ if (b->app == APP_NONE) return -1;
194
+ if (a->frq < min) return (b->frq < min) ? 0 : 1; /* items below the threshold follow the rest */
195
+ if (b->frq < min) return -1;
196
+ if (a->xfq > b->xfq) return 1; /* otherwise order by xfq */
197
+ if (a->xfq < b->xfq) return -1;
198
+ return 0; /* equal xfq values */
199
+ } /* _asccmpx() */
200
+
201
+ /*--------------------------------------------------------------------*/
202
+
203
+ static int _descmpx (const void *p1, const void *p2, void *data)
204
+ { /* --- usable items by descending transaction size sum (xfq) */
205
+ const ITEM *a = (const ITEM*)p1; /* left operand */
206
+ const ITEM *b = (const ITEM*)p2; /* right operand */
207
+ #ifdef ARCH64
208
+ long min = (long)data; /* frequency threshold carried in the pointer */
209
+ #else
210
+ int min = (int)data; /* frequency threshold carried in the pointer */
211
+ #endif
212
+
213
+ if (a->app == APP_NONE) return (b->app == APP_NONE) ? 0 : 1; /* ignored items go last */
214
+ if (b->app == APP_NONE) return -1;
215
+ if (a->frq < min) return (b->frq < min) ? 0 : 1; /* items below the threshold follow the rest */
216
+ if (b->frq < min) return -1;
217
+ if (a->xfq > b->xfq) return -1; /* higher xfq sorts first */
218
+ if (a->xfq < b->xfq) return 1;
219
+ return 0; /* equal xfq values */
220
+ } /* _descmpx() */
221
+
222
+ /*----------------------------------------------------------------------
223
+ Item Set Functions
224
+ ----------------------------------------------------------------------*/
225
+
226
+ ITEMSET* is_create (int cnt)
227
+ { /* --- create an item set; cnt <= 0 selects BLKSIZE; returns NULL on failure */
228
+ ITEMSET *iset; /* created item set */
229
+
230
+ if (cnt <= 0) cnt = BLKSIZE; /* check and adapt number of items */
231
+ iset = malloc(sizeof(ITEMSET));
232
+ if (!iset) return NULL; /* create an item set */
233
+ iset->tscan = ts_create(); /* and its components */
234
+ iset->nimap = nim_create(0, 0, (HASHFN*)0, (SYMFN*)0);
235
+ iset->items = (int*)malloc(cnt *sizeof(int));
236
+ if (!iset->tscan || !iset->nimap || !iset->items) {
237
+ is_delete(iset); return NULL; } /* is_delete skips the NULL components */
238
+ ts_chars(iset->tscan, TS_NULL, ""); /* configure the scanner only after the NULL check */
239
+ iset->tac = iset->cnt = 0; /* initialize the other fields */
240
+ iset->app = APP_BOTH;
241
+ iset->vsz = cnt;
242
+ iset->chars[0] = ' '; iset->chars[1] = ' ';
243
+ iset->chars[2] = '\n'; iset->chars[3] = '\0';
244
+ return iset; /* return the created item set */
245
+ } /* is_create() */
246
+
247
+ /*--------------------------------------------------------------------*/
248
+
249
+ void is_delete (ITEMSET *iset)
250
+ { /* --- release an item set together with its components */
251
+ assert(iset != NULL); /* a NULL item set is a caller error */
252
+ free(iset->items); /* free(NULL) is a no-op, no guard needed */
253
+ if (iset->nimap != NULL) nim_delete(iset->nimap);
254
+ if (iset->tscan != NULL) ts_delete(iset->tscan);
255
+ free(iset); /* finally release the item set body */
256
+ } /* is_delete() */
257
+
258
+ /*--------------------------------------------------------------------*/
259
+
260
+ void is_chars (ITEMSET *iset, const char *blanks, const char *fldseps,
261
+ const char *recseps, const char *comment)
262
+ { /* --- configure the scanner's character classes (NULL = leave as is) */
263
+ assert(iset != NULL); /* the item set is mandatory */
264
+ if (blanks != NULL) { /* blank characters, result kept in chars[0] */
265
+ iset->chars[0] = ts_chars(iset->tscan, TS_BLANK, blanks); }
266
+ if (fldseps != NULL) { /* field separators, result kept in chars[1] */
267
+ iset->chars[1] = ts_chars(iset->tscan, TS_FLDSEP, fldseps); }
268
+ if (recseps != NULL) { /* record separators, result kept in chars[2] */
269
+ iset->chars[2] = ts_chars(iset->tscan, TS_RECSEP, recseps); }
270
+ if (comment != NULL) { /* comment indicators, result not stored */
271
+ ts_chars(iset->tscan, TS_COMMENT, comment); }
272
+ } /* is_chars() */
273
+
274
+ /*--------------------------------------------------------------------*/
275
+
276
+ int is_item (ITEMSET *iset, const char *name)
277
+ { /* --- map an item name to its identifier, -1 if the lookup fails */
278
+ ITEM *hit = nim_byname(iset->nimap, name); /* query the name/identifier map */
279
+ if (!hit) return -1; else return hit->id;
280
+ } /* is_item() */
281
+
282
+ /*--------------------------------------------------------------------*/
283
+
284
+ int is_readapp (ITEMSET *iset, FILE *file)
285
+ { /* --- read the default indicator, then item/indicator pairs; 0 on success */
286
+ int d; /* delimiter type */
287
+ char *buf; /* read buffer */
288
+ ITEM *item; /* to access the item data */
289
+
290
+ assert(iset && file); /* check the function arguments */
291
+ buf = ts_buf(iset->tscan); /* fetched once and reused after every ts_next -- assumes a stable buffer, TODO confirm */
292
+ d = ts_next(iset->tscan, file, NULL, 0);
293
+ if (d == TS_ERR) return E_FREAD;
294
+ if (d != TS_REC) return E_FLDCNT; /* anything but TS_REC after the first field is a field count error */
295
+ iset->app = _appcode(buf); /* get default appearance code */
296
+ if (iset->app < 0) return E_UNKAPP;
297
+ while (1) { /* read item/indicator pairs */
298
+ d = ts_next(iset->tscan, file, NULL, 0);
299
+ if (d <= TS_EOF) /* read the next item */
300
+ return (d == TS_ERR) ? E_FREAD : 0; /* this is the only successful exit */
301
+ if (buf[0] == '\0') /* check for end of file */
302
+ return E_ITEMEXP; /* and for a missing item */
303
+ item = nim_add(iset->nimap, buf, sizeof(ITEM));
304
+ if (item == EXISTS) return E_DUPITEM; /* add the new item */
305
+ if (item == NULL) return E_NOMEM; /* to the name/id map */
306
+ item->frq = 0; /* clear the frequency counters */
307
+ item->xfq = 0; /* (occurrence and sum of t.a. sizes) */
308
+ if (d != TS_FLD) return E_APPEXP; /* NOTE(review): the item stays in the map with 'app' not yet set */
309
+ d = ts_next(iset->tscan, file, NULL, 0);
310
+ if (d == TS_ERR) return E_FREAD;
311
+ if (d == TS_FLD) return E_FLDCNT; /* a further field after the indicator is an error */
312
+ item->app = _appcode(buf); /* get the appearance indicator */
313
+ if (item->app < 0) return E_UNKAPP;
314
+ }
315
+ return 0; /* not reached: the loop above only leaves via return */
316
+ } /* is_readapp() */
317
+
318
+ /*--------------------------------------------------------------------*/
319
+
320
+ int is_read (ITEMSET *iset, FILE *file)
321
+ { /* --- read a transaction; returns 0 = ok, 1 = end of file, else an error code */
322
+ int i, d; /* loop variable, delimiter type */
323
+ char *buf; /* read buffer */
324
+ ITEM *item; /* pointer to item */
325
+
326
+
327
+ assert(iset && file); /* check the function arguments */
328
+ iset->cnt = 0; /* initialize the item counter */
329
+ d = _get_item(iset, file); /* read the first item and */
330
+ buf = ts_buf(iset->tscan); /* get the read buffer */
331
+
332
+ if ((d == TS_EOF) /* if at the end of the file */
333
+ && (buf[0] == '\0')) /* and no item has been read, */
334
+ return 1; /* return 'end of file' */
335
+ while ((d == TS_FLD) /* read the other items */
336
+ && (buf[0] != '\0')) /* of the transaction */
337
+ d = _get_item(iset, file); /* up to the end of the record */
338
+ if (d == TS_ERR) return d; /* NOTE(review): _get_item may also return E_NOMEM; only caught here if it equals TS_ERR -- confirm */
339
+ if ((buf[0] == '\0') && (d == TS_FLD) && (iset->cnt > 0))
340
+ return E_ITEMEXP; /* empty field after at least one item */
341
+ ta_sort(iset->items, iset->cnt); /* prepare the transaction */
342
+ iset->cnt = ta_unique(iset->items, iset->cnt);
343
+ for (i = iset->cnt; --i >= 0; ) {
344
+ item = nim_byid(iset->nimap, iset->items[i]);
345
+ item->frq += 1; /* count the item and */
346
+ item->xfq += iset->cnt; /* sum the transaction sizes */
347
+ } /* as an importance indicator */
348
+ iset->tac += 1; /* count the transaction (also when it holds no items) */
349
+ return 0; /* return 'ok' */
350
+ } /* is_read() */
351
+
352
+ /*--------------------------------------------------------------------*/
353
+
354
+ int is_recode (ITEMSET *iset, int minfrq, int dir, int *map)
355
+ { /* --- recode items w.r.t. frequency */
356
+ int i, k, n, t; /* loop variables, buffer */
357
+ ITEM *item; /* to traverse the items */
358
+ SYMCMPFN *cmp; /* comparison function */
359
+
360
+ assert(iset); /* check the function arguments */
361
+ if (dir > 1) cmp = _asccmpx; /* get the appropriate */
362
+ else if (dir > 0) cmp = _asccmp; /* comparison function */
363
+ else if (dir >= 0) cmp = _nocmp; /* (ascending/descending) */
364
+ else if (dir > -2) cmp = _descmp; /* and sort the items */
365
+ else cmp = _descmpx; /* w.r.t. their frequency */
366
+ nim_sort(iset->nimap, cmp, (void*)minfrq, map, 1);
367
+ for (n = nim_cnt(iset->nimap); --n >= 0; ) {
368
+ item = (ITEM*)nim_byid(iset->nimap, n);
369
+ if (item->frq < minfrq) /* determine frequent items and */
370
+ item->app = APP_NONE; /* set all others to 'ignore' */
371
+ else if (item->app != APP_NONE)
372
+ break; /* in addition, skip all items */
373
+ } /* that have been set to 'ignore' */
374
+ if (map) { /* if a map vector is provided */
375
+ for (i = k = 0; i < iset->cnt; i++) {
376
+ t = map[iset->items[i]]; /* traverse the current transaction */
377
+ if (t <= n) iset->items[k++] = t;
378
+ } /* recode all items and */
379
+ iset->cnt = k; /* delete all items to ignore */
380
+ ta_sort(iset->items, k); /* resort the items */
381
+ }
382
+ return n+1; /* return number of frequent items */
383
+ } /* is_recode() */
384
+
385
+ /*--------------------------------------------------------------------*/
386
+
387
+ int is_filter (ITEMSET *iset, const char *marks)
388
+ { /* --- filter items in transaction */
389
+ return iset->cnt = ta_filter(iset->items, iset->cnt, marks);
390
+ } /* is_filter() */
391
+
392
+ /*----------------------------------------------------------------------
393
+ Item Set Evaluation Functions
394
+ ----------------------------------------------------------------------*/
395
+
396
+ ISEVAL* ise_create (ITEMSET *iset, int tacnt)
397
+ { /* --- create an item set evaluation */
398
+ int i; /* loop variable */
399
+ ISEVAL *eval; /* created item set evaluator */
400
+
401
+ i = is_cnt(iset); /* get the number of items */
402
+ eval = (ISEVAL*)malloc(sizeof(ISEVAL) +(i+i) *sizeof(double));
403
+ if (!eval) return NULL; /* create an evaluation object */
404
+ eval->logfs = eval->lsums +i +1; /* and organize the memory */
405
+ eval->logta = log(tacnt); /* store log of number of trans. */
406
+ while (--i >= 0) /* compute logarithms of item freqs. */
407
+ eval->logfs[i] = log(is_getfrq(iset, i));
408
+ eval->lsums[0] = 0; /* init. first sum of logarithms */
409
+ return eval; /* return created item set evaluator */
410
+ } /* ise_create() */
411
+
412
+ /*--------------------------------------------------------------------*/
413
+
414
+ double ise_eval (ISEVAL *eval, int *ids, int cnt, int pfx, double supp)
415
+ { /* --- evaluate an item set */
416
+ double sum; /* sum of logarithms of frequencies */
417
+
418
+ sum = (pfx > 0) /* if there is a prefix, */
419
+ ? eval->lsums[pfx-1] : 0; /* get already known logarithm sum */
420
+ for ( ; pfx < cnt; pfx++) /* compute and add remaining terms */
421
+ eval->lsums[pfx] = sum += eval->logfs[ids[pfx]];
422
+ return (log(supp) -sum +(cnt-1) *eval->logta) * (1.0/LN_2);
423
+ } /* ise_eval() */ /* compute logarithm of quotient */
424
+
425
+ /*----------------------------------------------------------------------
426
+ Item Set Formatting Functions
427
+ ----------------------------------------------------------------------*/
428
+
429
+ ISFMTR* isf_create (ITEMSET *iset, int scan)
430
+ { /* --- create an item set formatter */
431
+ int i, k, n; /* loop variable, buffers */
432
+ int len, sum; /* length of an item name and sum */
433
+ ISFMTR *fmt; /* created item set formatter */
434
+ char buf[4*TS_SIZE+4]; /* buffer for formatting */
435
+ const char *name; /* to traverse the item names */
436
+ char *copy; /* for copies of formatted names */
437
+
438
+ n = is_cnt(iset); /* get the number of items */
439
+ fmt = (ISFMTR*)malloc(sizeof(ISFMTR) + n *sizeof(int)
440
+ +(n-1) *sizeof(char*));
441
+ if (!fmt) return NULL; /* create the base structure */
442
+ fmt->buf = NULL; /* and organize the memory */
443
+ fmt->offs = (int*)(fmt->names +n);
444
+ for (i = sum = fmt->cnt = 0; i < n; i++) {
445
+ name = is_name(iset, i); /* traverse the item names */
446
+ len = strlen(name); /* and get their length */
447
+ sum += k = (scan) ? sc_format(buf, name, 0) : len;
448
+ if (k > len) { /* if formatting was needed */
449
+ copy = (char*)malloc((k+1) *sizeof(char));
450
+ if (!copy) { fmt->cnt = i-1; isf_delete(fmt); return NULL; }
451
+ name = strcpy(copy, buf); /* copy the formatted name */
452
+ } /* into a newly created string */
453
+ fmt->names[i] = name; /* store (formatted) item name */
454
+ } /* afterwards create output buffer */
455
+ if (scan) fmt->cnt = n; /* note the number of items */
456
+ fmt->buf = (char*)malloc((sum +n +1) *sizeof(char));
457
+ if (!fmt->buf) { isf_delete(fmt); return NULL; }
458
+ fmt->offs[0] = 0; /* init. the first prefix offset */
459
+ return fmt; /* return created item set formatter */
460
+ } /* isf_create() */
461
+
462
+ /*--------------------------------------------------------------------*/
463
+
464
+ void isf_delete (ISFMTR *fmt)
465
+ { /* --- delete an item set formatter */
466
+ int i; /* loop variable */
467
+ for (i = fmt->cnt; --i >= 0; )
468
+ if ((fmt->names[i] != NULL)
469
+ && (fmt->names[i][0] == '"'))
470
+ free((void*)fmt->names[i]);
471
+ if (fmt->buf) free(fmt->buf); /* delete reformatted item names, */
472
+ free(fmt); /* the output buffer and the base */
473
+ } /* isf_delete() */
474
+
475
+ /*--------------------------------------------------------------------*/
476
+
477
+ const char* isf_format (ISFMTR *fmt, int *ids, int cnt, int pre)
478
+ { /* --- format an item set */
479
+ char *p; /* to traverse the output buffer */
480
+ const char *name; /* to traverse the item names */
481
+
482
+ p = fmt->buf +fmt->offs[pre]; /* get position for appending */
483
+ while (pre < cnt) { /* traverse the additional items */
484
+ name = fmt->names[ids[pre]];/* copy the item name to the output */
485
+ while (*name) *p++ = *name++;
486
+ *p++ = ' '; /* add an item separator */
487
+ fmt->offs[++pre] = (int)(p-fmt->buf);
488
+ } /* record the new offset */
489
+ *p = '\0'; /* terminate the formatted item set */
490
+ fmt->len = (int)(p-fmt->buf); /* note the length of the description */
491
+ return fmt->buf; /* return the output buffer */
492
+ } /* isf_format() */
493
+
494
+ /*----------------------------------------------------------------------
495
+ Transaction Functions
496
+ ----------------------------------------------------------------------*/
497
+
498
/*----------------------------------------------------------------------
  ta_unique: remove adjacent duplicates from an item vector in place
  (for a sorted vector this removes all duplicates).
  Returns the new number of items.
----------------------------------------------------------------------*/
int ta_unique (int *items, int n)
{
  int rd, wr;                   /* read and write index */

  assert(items && (n >= 0));    /* check the function arguments */
  if (n <= 1) return n;         /* nothing to do for 0 or 1 item */
  wr = 0;                       /* the first item is always kept */
  for (rd = 1; rd < n; rd++) {
    if (items[rd] != items[wr]) /* keep an item only if it differs */
      items[++wr] = items[rd];  /* from the last item kept */
  }
  return wr +1;                 /* return the new number of items */
}  /* ta_unique() */
508
+
509
+ /*--------------------------------------------------------------------*/
510
+
511
/*----------------------------------------------------------------------
  ta_filter: keep only those items whose entry in marks is nonzero.
  The vector is compacted in place; the order of the kept items is
  preserved.
  Returns the new number of items.
----------------------------------------------------------------------*/
int ta_filter (int *items, int n, const char *marks)
{
  int *src, *dst, *end;         /* read, write and end position */

  assert(items && (n >= 0));    /* check the function arguments */
  dst = items;
  end = items +n;
  for (src = items; src < end; src++)
    if (marks[*src]) *dst++ = *src;  /* copy marked items down */
  return (int)(dst -items);     /* return the new number of items */
}  /* ta_filter() */
520
+
521
+ /*--------------------------------------------------------------------*/
522
+
523
+ static int ta_cmp (const void *p1, const void *p2, void *data)
524
+ { /* --- compare transactions */
525
+ int k, k1, k2; /* loop variable, counters */
526
+ const int *i1, *i2; /* to traverse the item identifiers */
527
+
528
+ assert(p1 && p2); /* check the function arguments */
529
+ i1 = ((const TRACT*)p1)->items;
530
+ i2 = ((const TRACT*)p2)->items;
531
+ k1 = ((const TRACT*)p1)->cnt; /* get the item vectors */
532
+ k2 = ((const TRACT*)p2)->cnt; /* and the numbers of items */
533
+ for (k = (k1 < k2) ? k1 : k2; --k >= 0; i1++, i2++) {
534
+ if (*i1 > *i2) return 1; /* compare corresponding items */
535
+ if (*i1 < *i2) return -1; /* and abort the comparison */
536
+ } /* if one of them is greater */
537
+ if (k1 > k2) return 1; /* if one of the transactions */
538
+ if (k1 < k2) return -1; /* is not empty, it is greater */
539
+ return 0; /* otherwise the two trans. are equal */
540
+ } /* ta_cmp() */
541
+
542
+ /*--------------------------------------------------------------------*/
543
+
544
+ static int ta_cmpx (const TRACT *ta, const int *items, int n)
545
+ { /* --- compare transactions */
546
+ int k, m; /* loop variable, counter */
547
+ const int *p; /* to traverse the item identifiers */
548
+
549
+ assert(ta && items); /* check the function arguments */
550
+ p = ta->items; m = ta->cnt; /* traverse the item vector */
551
+ m = ta->cnt;
552
+ for (k = (n < m) ? n : m; --k >= 0; p++, items++) {
553
+ if (*p > *items) return 1; /* compare corresponding items */
554
+ if (*p < *items) return -1; /* and abort the comparison */
555
+ } /* if one of them is greater */
556
+ if (m > n) return 1; /* if one of the transactions */
557
+ if (m < n) return -1; /* is not empty, it is greater */
558
+ return 0; /* otherwise the two trans. are equal */
559
+ } /* ta_cmpx() */
560
+
561
+ /*----------------------------------------------------------------------
562
+ Transaction Set Functions
563
+ ----------------------------------------------------------------------*/
564
+
565
+ TASET* tas_create (ITEMSET *itemset)
566
+ { /* --- create a transaction set */
567
+ TASET *taset; /* created transaction set */
568
+
569
+ assert(itemset); /* check the function argument */
570
+ taset = malloc(sizeof(TASET));
571
+ if (!taset) return NULL; /* create a transaction set */
572
+ taset->itemset = itemset; /* and store the item set */
573
+ taset->cnt = taset->vsz = taset->max = taset->total = 0;
574
+ taset->tracts = NULL; /* initialize the other fields */
575
+ return taset; /* return the created t.a. set */
576
+ } /* tas_create() */
577
+
578
+ /*--------------------------------------------------------------------*/
579
+
580
+ void tas_delete (TASET *taset, int delis)
581
+ { /* --- delete a transaction set */
582
+ assert(taset); /* check the function argument */
583
+ if (taset->tracts) { /* if there are loaded transactions */
584
+ while (--taset->cnt >= 0) /* traverse the transaction vector */
585
+ free(taset->tracts[taset->cnt]);
586
+ free(taset->tracts); /* delete all transactions */
587
+ } /* and the transaction vector */
588
+ if (delis && taset->itemset) is_delete(taset->itemset);
589
+ free(taset); /* delete the item set and */
590
+ } /* tas_delete() */ /* the transaction set body */
591
+
592
+ /*--------------------------------------------------------------------*/
593
+
594
+ int tas_add (TASET *taset, const int *items, int n)
595
+ { /* --- add a transaction */
596
+ TRACT *ta; /* new transaction */
597
+ int *p; /* to traverse the transaction */
598
+ TRACT **vec; /* new transaction vector */
599
+ int size; /* new transaction vector size */
600
+
601
+ assert(taset); /* check the function arguments */
602
+ size = taset->vsz; /* get the transaction vector size */
603
+ if (taset->cnt >= size) { /* if the transaction vector is full */
604
+ size += (size > BLKSIZE) ? (size >> 1) : BLKSIZE;
605
+ vec = (TRACT**)realloc(taset->tracts, size *sizeof(TRACT*));
606
+ if (!vec) return -1; /* enlarge the transaction vector */
607
+ taset->tracts = vec; taset->vsz = size;
608
+ } /* set the new vector and its size */
609
+ if (!items) { /* if no transaction is given */
610
+ items = is_tract(taset->itemset);
611
+ n = is_tsize(taset->itemset);
612
+ } /* get it from the item set */
613
+ ta = (TRACT*)malloc(sizeof(TRACT) +(n-1) *sizeof(int));
614
+ if (!ta) return -1; /* create a new transaction */
615
+ taset->tracts[taset->cnt++] = ta;
616
+ if (n > taset->max) /* store the transaction and */
617
+ taset->max = n; /* update maximal transaction size */
618
+ taset->total += n; /* sum the number of items */
619
+ for (p = ta->items +(ta->cnt = n); --n >= 0; )
620
+ *--p = items[n]; /* copy the items of the t.a. */
621
+ return 0; /* return 'ok' */
622
+ } /* tas_add() */
623
+
624
+ /*--------------------------------------------------------------------*/
625
+
626
+ void tas_recode (TASET *taset, int *map, int cnt)
627
+ { /* --- recode items */
628
+ int i, k, n, x; /* loop variables, buffer */
629
+ TRACT *t; /* to traverse the transactions */
630
+ int *p; /* to traverse the item identifiers */
631
+
632
+ assert(taset && map); /* check the function arguments */
633
+ taset->max = taset->total = 0;/* clear the maximal size and total */
634
+ for (n = taset->cnt; --n >= 0; ) {
635
+ t = taset->tracts[n]; /* traverse the transactions and */
636
+ p = t->items; /* the items of each transaction */
637
+ for (i = k = 0; i < t->cnt; i++) {
638
+ x = map[p[i]]; /* recode the items and */
639
+ if (x < cnt) p[k++] = x; /* remove superfluous items */
640
+ } /* from the transaction */
641
+ if (k > taset->max) /* update the max. transaction size */
642
+ taset->max = k; /* with the new size of the t.a. */
643
+ taset->total += k; /* sum the number of items */
644
+ ta_sort(t->items, t->cnt = k);
645
+ } /* resort the item identifiers */
646
+ } /* tas_recode() */
647
+
648
+ /*--------------------------------------------------------------------*/
649
+
650
+ int tas_filter (TASET *taset, const char *marks)
651
+ { /* --- filter items in a trans. set */
652
+ int i, max = 0; /* loop variable, max. num. of items */
653
+ TRACT *t; /* to traverse the transactions */
654
+
655
+ assert(taset && marks); /* check the function arguments */
656
+ taset->total = 0; /* clear the total number of items */
657
+ for (i = taset->cnt; --i >= 0; ) {
658
+ t = taset->tracts[i]; /* traverse the transactions */
659
+ t->cnt = ta_filter(t->items, t->cnt, marks);
660
+ if (t->cnt > max) max = t->cnt;
661
+ taset->total += t->cnt; /* filter each transaction and */
662
+ } /* update maximal size and total */
663
+ return max; /* return maximum number of items */
664
+ } /* tas_filter() */
665
+
666
+ /*--------------------------------------------------------------------*/
667
+
668
+ void tas_sort (TASET *taset, int heap)
669
+ { /* --- sort a transaction set */
670
+ assert(taset); /* check the function argument */
671
+ if (heap) v_heapsort(taset->tracts, taset->cnt, ta_cmp, NULL);
672
+ else v_sort (taset->tracts, taset->cnt, ta_cmp, NULL);
673
+ } /* tas_sort() */
674
+
675
+ /*--------------------------------------------------------------------*/
676
+
677
+ int tas_occur (TASET *taset, const int *items, int n)
678
+ { /* --- count transaction occurrences */
679
+ int l, r, m, k = taset->cnt; /* index variables */
680
+
681
+ assert(taset && items); /* check the function arguments */
682
+ for (r = m = 0; r < k; ) { /* find right boundary */
683
+ m = (r + k) >> 1; /* by a binary search */
684
+ if (ta_cmpx(taset->tracts[m], items, n) > 0) k = m;
685
+ else r = m+1;
686
+ }
687
+ for (l = m = 0; l < k; ) { /* find left boundary */
688
+ m = (l + k) >> 1; /* by a binary search */
689
+ if (ta_cmpx(taset->tracts[m], items, n) < 0) l = m+1;
690
+ else k = m;
691
+ }
692
+ return r -l; /* compute the number of occurrences */
693
+ } /* tas_occur() */
694
+
695
+ /*--------------------------------------------------------------------*/
696
+ #ifndef NDEBUG
697
+
698
+ void tas_show (TASET *taset)
699
+ { /* --- show a transaction set */
700
+ int i, k; /* loop variables */
701
+ TRACT *t; /* to traverse the transactions */
702
+
703
+ assert(taset); /* check the function argument */
704
+ for (i = 0; i < taset->cnt; i++) {
705
+ t = taset->tracts[i]; /* traverse the transactions */
706
+ for (k = 0; k < t->cnt; k++) { /* traverse the items */
707
+ if (k > 0) putc(' ', stdout); /* print a separator */
708
+ printf(is_name(taset->itemset, t->items[k]));
709
+ } /* print the next item */
710
+ putc('\n', stdout); /* terminate the transaction */
711
+ } /* finally print the number of t.a. */
712
+ printf("%d transaction(s)\n", taset->cnt);
713
+ } /* tas_show() */
714
+
715
+ #endif
716
+ /*----------------------------------------------------------------------
717
+ Transaction Tree Functions
718
+ ----------------------------------------------------------------------*/
719
+
720
+ TATREE* _create (TRACT **tracts, int cnt, int index)
720
+ { /* --- recursive part of tat_create() */
721
/* Builds the tree node for the cnt transactions in tracts[], looking
   only at item position index and beyond.
   - cnt <= 1: a leaf is created that stores the remaining items of the
     single transaction (none if cnt == 0); size is set to -n, max to n.
   - otherwise: leading transactions with no item at position index are
     skipped, the number n of runs of equal items at position index is
     counted, and a node with n items and n child pointers is created.
     One child per run is built recursively with index+1; items[] and
     the child vector are filled from the last run to the first. max
     becomes the largest child max plus one.
   Counting runs by comparing neighbours, and skipping short
   transactions only at the front, presumably relies on the caller
   having sorted tracts[] (tat_create() sorts with ta_cmp()).
   Returns the node, or NULL if an allocation failed; children that
   were already created are deleted in that case. */
+ int i, k, t; /* loop variables, buffer */
722
+ int item, n; /* item and item counter */
723
+ TATREE *tat; /* created transaction tree */
724
+ TATREE **vec; /* vector of child pointers */
725
+
726
+ assert(tracts /* check the function arguments */
727
+ && (cnt >= 0) && (index >= 0));
728
+ if (cnt <= 1) { /* if at most one transaction is left */
729
+ n = (cnt > 0) ? (*tracts)->cnt -index : 0; /* number of items from index on */
730
+ tat = (TATREE*)malloc(sizeof(TATREE) +(n-1) *sizeof(int));
731
+ if (!tat) return NULL; /* create a transaction tree node */
732
+ tat->cnt = cnt; /* and initialize its fields */
733
+ tat->size = -n; /* a negative size marks a leaf with n items */
734
+ tat->max = n;
735
+ while (--n >= 0) tat->items[n] = (*tracts)->items[index +n]; /* copy the remaining items */
736
+ return tat;
737
+ }
738
+ for (k = cnt; (--k >= 0) && ((*tracts)->cnt <= index); )
739
+ tracts++; /* skip t.a. that are too short */
740
+ n = 0; item = -1; /* init. item and item counter */
741
+ for (tracts += i = ++k; --i >= 0; ) { /* k: number of t.a. left; walk backwards from the end */
742
+ t = (*--tracts)->items[index]; /* traverse the transactions */
743
+ if (t != item) { item = t; n++; }
744
+ } /* count the different items; tracts is back at the first remaining t.a. */
745
+ #ifdef ARCH64 /* pad to an odd item count, */
746
+ i = (n & 1) ? n : (n+1); /* presumably so that the child */
747
+ #else /* pointers behind items[] are 8-byte aligned */
748
+ i = n; /* on 32 bit systems, however, */
749
+ #endif /* use the exact number of items */
750
+ tat = (TATREE*)malloc(sizeof(TATREE) + (i-1) *sizeof(int)
751
+ + n *sizeof(TATREE*));
752
+ if (!tat) return NULL; /* create a transaction tree node */
753
+ tat->cnt = cnt; /* and initialize its fields */
754
+ tat->size = n;
755
+ tat->max = 0;
756
+ if (n <= 0) return tat; /* if t.a. are fully captured, abort */
757
+ vec = (TATREE**)(tat->items +i); /* child pointers follow the (padded) items */
758
+ item = tracts[--k]->items[index]; /* item of the last t.a.; k is now its index */
759
+ for (tracts += i = k; --i >= 0; ) { /* walk backwards from the last t.a. */
760
+ t = (*--tracts)->items[index]; /* traverse the transactions, */
761
+ if (t == item) continue; /* but skip those with the same item */
762
+ tat->items[--n] = item; item = t; /* a run ended: note its item identifier */
763
+ vec[n] = _create(tracts+1, k-i, index+1); /* the run covers t.a. i+1 .. k */
764
+ if (!vec[n]) break; /* stop if the subtree could not be created */
765
+ t = vec[n]->max +1; if (t > tat->max) tat->max = t;
766
+ k = i; /* recursively create subtrees */
767
+ } /* and adapt the section end index */
768
+ if (i < 0) { /* if child creation was successful */
769
+ tat->items[--n] = item; /* note the last item identifier */
770
+ vec[n] = _create(tracts, k+1, index+1); /* the first run covers t.a. 0 .. k */
771
+ if (vec[n]) { /* create the last child */
772
+ t = vec[n]->max +1; if (t > tat->max) tat->max = t;
773
+ return tat; /* return the created */
774
+ } /* transaction tree */
775
+ }
776
+ for (i = tat->size; --i > n; ) tat_delete(vec[i]); /* children behind index n exist */
777
+ free(tat); /* on error delete created subtrees */
778
+ return NULL; /* and the transaction tree node */
779
+ } /* _create() */
781
+
782
+ /*--------------------------------------------------------------------*/
783
+
784
+ TATREE* tat_create (TASET *taset, int heap)
785
+ { /* --- create a transactions tree */
786
+ assert(taset); /* check the function argument */
787
+ if (heap) v_heapsort(taset->tracts, taset->cnt, ta_cmp, NULL);
788
+ else v_sort (taset->tracts, taset->cnt, ta_cmp, NULL);
789
+ return _create(taset->tracts, taset->cnt, 0);
790
+ } /* tat_create() */
791
+
792
+ /*--------------------------------------------------------------------*/
793
+
794
+ void tat_delete (TATREE *tat)
795
+ { /* --- delete a transaction tree */
796
+ int i; /* loop variable */
797
+ TATREE **vec; /* vector of child nodes */
798
+
799
+ assert(tat); /* check the function argument */
800
+ #ifdef ARCH64 /* if 64 bit architecture */
801
+ i = (tat->size & 1) ? tat->size : (tat->size+1);
802
+ #else /* address must be a multiple of 8 */
803
+ i = tat->size; /* on 32 bit systems, however, */
804
+ #endif /* use the number of items directly */
805
+ vec = (TATREE**)(tat->items +i);
806
+ for (i = tat->size; --i >= 0; )
807
+ tat_delete(vec[i]); /* recursively delete the subtrees */
808
+ free(tat); /* and the tree node itself */
809
+ } /* tat_delete() */
810
+
811
+ /*--------------------------------------------------------------------*/
812
+ #ifdef ARCH64
813
+
814
+ TATREE* tat_child (TATREE *tat, int index)
815
+ { /* --- go to a child node */
816
+ int s; /* padded size of the node */
817
+
818
+ assert(tat /* check the function arguments */
819
+ && (index >= 0) && (index < tat->size));
820
+ s = (tat->size & 1) ? tat->size : (tat->size +1);
821
+ return ((TATREE**)(tat->items +s))[index];
822
+ } /* tat_child */ /* return the child node/subtree */
823
+
824
+ #endif
825
+ /*--------------------------------------------------------------------*/
826
+
827
+ void tat_mark (TATREE *tat)
828
+ { /* --- mark end of transactions */
829
+ int i; /* loop variable */
830
+
831
+ assert(tat); /* check the function argument */
832
+ if (tat->size < 0) /* if there is a transaction, */
833
+ tat->items[tat->max-1] |= INT_MIN; /* mark end of trans. */
834
+ else { /* if there are subtrees */
835
+ for (i = tat->size; --i >= 0; )
836
+ tat_mark(tat_child(tat, i));
837
+ } /* recursively mark the subtrees */
838
+ } /* tat_mark() */
839
+
840
+ /*--------------------------------------------------------------------*/
841
+ #ifndef NDEBUG
842
+
843
+ void _show (TATREE *tat, int ind)
844
+ { /* --- rekursive part of tat_show() */
845
+ int i, k; /* loop variables */
846
+ TATREE **vec; /* vector of child nodes */
847
+
848
+ assert(tat && (ind >= 0)); /* check the function arguments */
849
+ if (tat->size <= 0) { /* if this is a leaf node */
850
+ for (i = 0; i < tat->max; i++)
851
+ printf("%d ", tat->items[i] & ~INT_MIN);
852
+ printf("\n"); return; /* print the items in the */
853
+ } /* (rest of) the transaction */
854
+ vec = (TATREE**)(tat->items +tat->size);
855
+ for (i = 0; i < tat->size; i++) {
856
+ if (i > 0) for (k = ind; --k >= 0; ) printf(" ");
857
+ printf("%d ", tat->items[i]);
858
+ _show(vec[i], ind+1); /* traverse the items, print them, */
859
+ } /* and show the children recursively */
860
+ } /* _show() */
861
+
862
+ /*--------------------------------------------------------------------*/
863
+
864
+ void tat_show (TATREE *tat)
865
+ { /* --- show a transaction tree */
866
+ assert(tat); /* check the function argument */
867
+ _show(tat, 0); /* just call the recursive function */
868
+ } /* tat_show() */
869
+
870
+ #endif