apriori 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +16 -0
- data/License.txt +20 -0
- data/Manifest.txt +121 -0
- data/README.txt +149 -0
- data/Rakefile +15 -0
- data/TODO.txt +60 -0
- data/attic/c_ext_test1/MyTest/MyTest.c +23 -0
- data/attic/c_ext_test1/MyTest/extconf.rb +11 -0
- data/attic/c_ext_test1/mytest.rb +10 -0
- data/attic/test.c +12 -0
- data/config/hoe.rb +81 -0
- data/config/requirements.rb +29 -0
- data/examples/01_simple_example.rb +32 -0
- data/examples/02_small_file_example.rb +17 -0
- data/examples/03_large_file_example.rb +22 -0
- data/examples/test_data/market_basket_basic_test.dat +9 -0
- data/ext/Apriori.c +149 -0
- data/ext/Makefile +149 -0
- data/ext/apriori/doc/apriori.html +1301 -0
- data/ext/apriori/doc/arem.gp +68 -0
- data/ext/apriori/doc/c_rev.gp +89 -0
- data/ext/apriori/doc/chi2.tex +156 -0
- data/ext/apriori/doc/copying +504 -0
- data/ext/apriori/doc/line.gif +0 -0
- data/ext/apriori/doc/uparrow.gif +0 -0
- data/ext/apriori/ex/flg2set +15 -0
- data/ext/apriori/ex/hdr2set +13 -0
- data/ext/apriori/ex/readme +71 -0
- data/ext/apriori/ex/row2set +7 -0
- data/ext/apriori/ex/rulesort +24 -0
- data/ext/apriori/ex/tab2set +9 -0
- data/ext/apriori/ex/test.app +2 -0
- data/ext/apriori/ex/test.rul +9 -0
- data/ext/apriori/ex/test1.rul +43 -0
- data/ext/apriori/ex/test1.tab +10 -0
- data/ext/apriori/ex/test2.tab +10 -0
- data/ext/apriori/ex/test3.tab +30 -0
- data/ext/apriori/ex/test4.tab +11 -0
- data/ext/apriori/ex/test5.tab +39 -0
- data/ext/apriori/ex/tid2set +23 -0
- data/ext/apriori/ex/xhdr2set +33 -0
- data/ext/apriori/src/apriori.c +750 -0
- data/ext/apriori/src/apriori.dsp +120 -0
- data/ext/apriori/src/apriori.dsw +29 -0
- data/ext/apriori/src/apriori.mak +99 -0
- data/ext/apriori/src/istree.c +1411 -0
- data/ext/apriori/src/istree.h +160 -0
- data/ext/apriori/src/makefile +105 -0
- data/ext/apriori/src/tract.c +870 -0
- data/ext/apriori/src/tract.h +261 -0
- data/ext/apriori_wrapper.c +757 -0
- data/ext/apriori_wrapper.h +10 -0
- data/ext/extconf.rb +32 -0
- data/ext/math/doc/copying +504 -0
- data/ext/math/src/chi2.c +151 -0
- data/ext/math/src/chi2.h +27 -0
- data/ext/math/src/choose.c +71 -0
- data/ext/math/src/choose.h +16 -0
- data/ext/math/src/gamma.c +446 -0
- data/ext/math/src/gamma.h +39 -0
- data/ext/math/src/intexp.c +35 -0
- data/ext/math/src/intexp.h +15 -0
- data/ext/math/src/makefile +164 -0
- data/ext/math/src/math.mak +48 -0
- data/ext/math/src/normal.c +387 -0
- data/ext/math/src/normal.h +44 -0
- data/ext/math/src/radfn.c +113 -0
- data/ext/math/src/radfn.h +34 -0
- data/ext/math/src/zeta.c +49 -0
- data/ext/math/src/zeta.h +15 -0
- data/ext/pre-clean.rb +8 -0
- data/ext/pre-setup.rb +9 -0
- data/ext/util/doc/copying +504 -0
- data/ext/util/src/listops.c +76 -0
- data/ext/util/src/listops.h +26 -0
- data/ext/util/src/makefile +103 -0
- data/ext/util/src/memsys.c +84 -0
- data/ext/util/src/memsys.h +42 -0
- data/ext/util/src/nstats.c +288 -0
- data/ext/util/src/nstats.h +69 -0
- data/ext/util/src/params.c +86 -0
- data/ext/util/src/params.h +19 -0
- data/ext/util/src/parse.c +133 -0
- data/ext/util/src/parse.h +81 -0
- data/ext/util/src/scan.c +767 -0
- data/ext/util/src/scan.h +111 -0
- data/ext/util/src/symtab.c +443 -0
- data/ext/util/src/symtab.h +121 -0
- data/ext/util/src/tabscan.c +279 -0
- data/ext/util/src/tabscan.h +99 -0
- data/ext/util/src/util.mak +91 -0
- data/ext/util/src/vecops.c +317 -0
- data/ext/util/src/vecops.h +42 -0
- data/lib/apriori.rb +133 -0
- data/lib/apriori/adapter.rb +13 -0
- data/lib/apriori/association_rule.rb +89 -0
- data/lib/apriori/version.rb +9 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/script/txt2html +82 -0
- data/setup.rb +1585 -0
- data/tasks/apriori.rake +20 -0
- data/tasks/attic.rake +28 -0
- data/tasks/deployment.rake +34 -0
- data/tasks/environment.rake +7 -0
- data/tasks/install.rake +13 -0
- data/tasks/website.rake +17 -0
- data/test/apriori_test.rb +13 -0
- data/test/fixtures/market_basket_results_test.txt +5 -0
- data/test/fixtures/market_basket_string_test.txt +7 -0
- data/test/fixtures/results.txt +2 -0
- data/test/fixtures/sample.txt +7 -0
- data/test/test_helper.rb +5 -0
- data/test/unit/test_apriori.rb +68 -0
- data/test/unit/test_itemsets_and_parsing.rb +82 -0
- data/website/index.html +248 -0
- data/website/index.txt +152 -0
- data/website/javascripts/rounded_corners_lite.inc.js +285 -0
- data/website/stylesheets/screen.css +142 -0
- data/website/template.html.erb +49 -0
- metadata +226 -0
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
/*----------------------------------------------------------------------
|
|
2
|
+
File : istree.h
|
|
3
|
+
Contents: item set tree management
|
|
4
|
+
Author : Christian Borgelt
|
|
5
|
+
History : 1996.01.22 file created
|
|
6
|
+
1996.01.29 ISNODE.offset and ISNODE.id added
|
|
7
|
+
1996.02.08 ISTREE.tacnt, ISTREE.curr, ISTREE.index,
|
|
8
|
+
ISTREE.head and ISTREE.conf added
|
|
9
|
+
1996.03.28 support made relative to number of item sets
|
|
10
|
+
1996.11.23 ISTREE.levels (first nodes of each level) added
|
|
11
|
+
1996.11.24 ISTREE.arem (add. rule evaluation measure) added
|
|
12
|
+
1997.08.18 chi^2 evaluation measure added
|
|
13
|
+
parameter 'minlen' added to function ist_init()
|
|
14
|
+
1998.02.11 parameter 'minval' added to function ist_init()
|
|
15
|
+
1998.05.14 item set tree navigation functions added
|
|
16
|
+
1998.08.08 parameter 'apps' added to function ist_create()
|
|
17
|
+
1998.08.20 structure ISNODE redesigned
|
|
18
|
+
1998.09.07 function ist_hedge added
|
|
19
|
+
1998.12.08 function ist_gettac added,
|
|
20
|
+
float changed to double
|
|
21
|
+
1999.02.05 long int changed to int
|
|
22
|
+
1999.08.26 functions ist_first and ist_last added
|
|
23
|
+
1999.11.05 rule evaluation measure EM_AIMP added
|
|
24
|
+
1999.11.08 parameter 'aval' added to function ist_rule
|
|
25
|
+
2001.04.01 functions ist_set and ist_getcntx added
|
|
26
|
+
2001.12.28 sort function moved to module tract
|
|
27
|
+
2002.02.07 function ist_clear removed, ist_settac added
|
|
28
|
+
2002.02.11 optional use of identifier maps in nodes added
|
|
29
|
+
2002.02.12 ist_first and ist_last replaced by ist_next
|
|
30
|
+
2003.03.12 parameter lift added to function ist_rule
|
|
31
|
+
2003.07.17 functions ist_itemcnt and ist_check added
|
|
32
|
+
2003.07.18 function ist_maxfrq added (item set filter)
|
|
33
|
+
2003.08.11 item set filtering generalized (ist_filter)
|
|
34
|
+
2004.05.09 parameter 'aval' added to function ist_set
|
|
35
|
+
2008.03.24 creation based on ITEMSET structure
|
|
36
|
+
----------------------------------------------------------------------*/
|
|
37
|
+
#ifndef __ISTREE__
|
|
38
|
+
#define __ISTREE__
|
|
39
|
+
#include "tract.h"
|
|
40
|
+
|
|
41
|
+
/*----------------------------------------------------------------------
|
|
42
|
+
Preprocessor Definitions
|
|
43
|
+
----------------------------------------------------------------------*/
|
|
44
|
+
/* --- additional evaluation measures --- */
|
|
45
|
+
#define EM_NONE 0 /* no measure */
|
|
46
|
+
#define EM_DIFF 1 /* absolute conf. difference to prior */
|
|
47
|
+
#define EM_QUOT 2 /* difference of conf. quotient to 1 */
|
|
48
|
+
#define EM_AIMP 3 /* abs. diff. of improvement to 1 */
|
|
49
|
+
#define EM_INFO 4 /* information difference to prior */
|
|
50
|
+
#define EM_CHI2 5 /* normalized chi^2 measure */
|
|
51
|
+
#define EM_PVAL 6 /* p-value of chi^2 measure */
|
|
52
|
+
#define EM_UNKNOWN 7 /* unknown measure */
|
|
53
|
+
|
|
54
|
+
/* --- item appearances --- */
|
|
55
|
+
#define IST_IGNORE 0 /* ignore item */
|
|
56
|
+
#define IST_BODY 1 /* item may appear in rule body */
|
|
57
|
+
#define IST_HEAD 2 /* item may appear in rule head */
|
|
58
|
+
#define IST_BOTH (IST_HEAD|IST_BODY)
|
|
59
|
+
|
|
60
|
+
/* --- search mode flags --- */
|
|
61
|
+
#define IST_MEMOPT 4 /* optimize memory usage */
|
|
62
|
+
|
|
63
|
+
/* --- item set filter modes --- */
|
|
64
|
+
#define IST_CLEAR 0 /* clear markers */
|
|
65
|
+
#define IST_CLOSED 1 /* closed item sets */
|
|
66
|
+
#define IST_MAXFRQ 2 /* maximal item sets */
|
|
67
|
+
|
|
68
|
+
/*----------------------------------------------------------------------
|
|
69
|
+
Type Definitions
|
|
70
|
+
----------------------------------------------------------------------*/
|
|
71
|
+
typedef struct _isnode { /* --- item set node --- */
|
|
72
|
+
struct _isnode *parent; /* parent node */
|
|
73
|
+
struct _isnode *succ; /* successor node on same level */
|
|
74
|
+
int id; /* identifier used in parent node */
|
|
75
|
+
int chcnt; /* number of child nodes */
|
|
76
|
+
int size; /* size of counter vector */
|
|
77
|
+
int offset; /* offset of counter vector */
|
|
78
|
+
int cnts[1]; /* counter vector */
|
|
79
|
+
} ISNODE; /* (item set node) */
|
|
80
|
+
|
|
81
|
+
typedef struct { /* --- item set tree --- */
|
|
82
|
+
ITEMSET *set; /* underlying item set */
|
|
83
|
+
int mode; /* search mode (e.g. support def.) */
|
|
84
|
+
int tacnt; /* number of transactions */
|
|
85
|
+
int vsz; /* size of level vector */
|
|
86
|
+
int height; /* tree height (number of levels) */
|
|
87
|
+
ISNODE **lvls; /* first node of each level */
|
|
88
|
+
int rule; /* minimal support of an assoc. rule */
|
|
89
|
+
int supp; /* minimal support of an item set */
|
|
90
|
+
double conf; /* minimal confidence of a rule */
|
|
91
|
+
int arem; /* additional rule evaluation measure */
|
|
92
|
+
double minval; /* minimal evaluation measure value */
|
|
93
|
+
ISNODE *curr; /* current node for traversal */
|
|
94
|
+
int size; /* size of item set/rule/hyperedge */
|
|
95
|
+
ISNODE *node; /* item set node for extraction */
|
|
96
|
+
int index; /* index in item set node */
|
|
97
|
+
ISNODE *head; /* head item node for extraction */
|
|
98
|
+
int item; /* head item of previous rule */
|
|
99
|
+
int *buf; /* buffer for paths (support check) */
|
|
100
|
+
int *path; /* current path / (partial) item set */
|
|
101
|
+
int plen; /* current path length */
|
|
102
|
+
int hdonly; /* head only item in current set */
|
|
103
|
+
int *map; /* to create identifier maps */
|
|
104
|
+
#ifdef BENCH /* if benchmark version */
|
|
105
|
+
int sccnt; /* number of support counters */
|
|
106
|
+
int scnec; /* number of necessary supp. counters */
|
|
107
|
+
int cpcnt; /* number of child pointers */
|
|
108
|
+
int cpnec; /* number of necessary child pointers */
|
|
109
|
+
int bytes; /* number of bytes used */
|
|
110
|
+
#endif
|
|
111
|
+
} ISTREE; /* (item set tree) */
|
|
112
|
+
|
|
113
|
+
/*----------------------------------------------------------------------
|
|
114
|
+
Functions
|
|
115
|
+
----------------------------------------------------------------------*/
|
|
116
|
+
extern ISTREE* ist_create (ITEMSET *set, int mode,
|
|
117
|
+
int supp, double conf);
|
|
118
|
+
extern void ist_delete (ISTREE *ist);
|
|
119
|
+
extern int ist_itemcnt (ISTREE *ist);
|
|
120
|
+
|
|
121
|
+
extern void ist_count (ISTREE *ist, int *set, int cnt);
|
|
122
|
+
extern void ist_countx (ISTREE *ist, TATREE *tat);
|
|
123
|
+
extern int ist_settac (ISTREE *ist, int cnt);
|
|
124
|
+
extern int ist_gettac (ISTREE *ist);
|
|
125
|
+
extern int ist_check (ISTREE *ist, char *marks);
|
|
126
|
+
extern int ist_addlvl (ISTREE *ist);
|
|
127
|
+
extern int ist_height (ISTREE *ist);
|
|
128
|
+
|
|
129
|
+
extern void ist_up (ISTREE *ist, int root);
|
|
130
|
+
extern int ist_down (ISTREE *ist, int item);
|
|
131
|
+
extern int ist_next (ISTREE *ist, int item);
|
|
132
|
+
extern void ist_setcnt (ISTREE *ist, int item, int cnt);
|
|
133
|
+
extern int ist_getcnt (ISTREE *ist, int item);
|
|
134
|
+
extern int ist_getcntx (ISTREE *ist, int *set, int cnt);
|
|
135
|
+
|
|
136
|
+
extern void ist_filter (ISTREE *ist, int mode);
|
|
137
|
+
extern void ist_init (ISTREE *ist, int minlen,
|
|
138
|
+
int arem, double minval);
|
|
139
|
+
extern int ist_set (ISTREE *ist, int *set, int *supp,
|
|
140
|
+
double *aval);
|
|
141
|
+
extern int ist_rule (ISTREE *ist, int *rule, int *supp,
|
|
142
|
+
double *conf, double *lift, double *aval);
|
|
143
|
+
extern int ist_hedge (ISTREE *ist, int *hedge, int *supp,
|
|
144
|
+
double *conf, double *aval);
|
|
145
|
+
extern int ist_group (ISTREE *ist, int *asmb, int *supp,
|
|
146
|
+
double *aval);
|
|
147
|
+
|
|
148
|
+
#ifndef NDEBUG
|
|
149
|
+
extern void ist_show (ISTREE *ist);
|
|
150
|
+
#endif
|
|
151
|
+
|
|
152
|
+
/*----------------------------------------------------------------------
|
|
153
|
+
Preprocessor Definitions
|
|
154
|
+
----------------------------------------------------------------------*/
|
|
155
|
+
#define ist_itemcnt(t)     ((t)->lvls[0]->size)  /* counter vector size of the first node on level 0 (member is 'lvls', not 'levels') */
|
|
156
|
+
#define ist_settac(t,n) ((t)->tacnt = (n))
|
|
157
|
+
#define ist_gettac(t) ((t)->tacnt)
|
|
158
|
+
#define ist_height(t) ((t)->height)
|
|
159
|
+
|
|
160
|
+
#endif
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
#-----------------------------------------------------------------------
|
|
2
|
+
# File : makefile
|
|
3
|
+
# Contents: build apriori program
|
|
4
|
+
# Author : Christian Borgelt
|
|
5
|
+
# History : ??.??.1995 file created
|
|
6
|
+
# 1997.10.13 macro ADDFLAGS added
|
|
7
|
+
# 1997.12.07 minor improvements
|
|
8
|
+
# 1998.01.04 table scanner management added
|
|
9
|
+
# 1999.11.11 vector operations module added
|
|
10
|
+
# 2000.11.04 modules vecops, symtab, and tabscan made external
|
|
11
|
+
# 2001.11.18 module tract (transaction management) added
|
|
12
|
+
# 2003.12.12 preprocessor definition ARCH64 added
|
|
13
|
+
#-----------------------------------------------------------------------
|
|
14
|
+
CC = gcc
|
|
15
|
+
CFBASE = -ansi -Wall -pedantic -I$(UTILDIR) -I$(MATHDIR) $(ADDFLAGS)
|
|
16
|
+
CFLAGS = $(CFBASE) -DNDEBUG -O3
|
|
17
|
+
# CFLAGS = $(CFBASE) -DNDEBUG -O3 -DBENCH
|
|
18
|
+
# CFLAGS = $(CFBASE) -DNDEBUG -O3 -DARCH64
|
|
19
|
+
# CFLAGS = $(CFBASE) -g
|
|
20
|
+
# CFLAGS = $(CFBASE) -g -DARCH64
|
|
21
|
+
# CFLAGS = $(CFBASE) -g -DSTORAGE $(ADDINC)
|
|
22
|
+
LDFLAGS =
|
|
23
|
+
LIBS = -lm
|
|
24
|
+
# ADDINC = -I../../misc/src
|
|
25
|
+
# ADDOBJ = storage.o
|
|
26
|
+
|
|
27
|
+
UTILDIR = ../../util/src
|
|
28
|
+
MATHDIR = ../../math/src
|
|
29
|
+
HDRS = $(UTILDIR)/vecops.h $(UTILDIR)/symtab.h \
|
|
30
|
+
$(UTILDIR)/tabscan.h $(UTILDIR)/scan.h \
|
|
31
|
+
$(MATHDIR)/gamma.h $(MATHDIR)/chi2.h \
|
|
32
|
+
tract.h istree.h
|
|
33
|
+
OBJS = $(UTILDIR)/vecops.o $(UTILDIR)/nimap.o \
|
|
34
|
+
$(UTILDIR)/tabscan.o $(UTILDIR)/scform.o \
|
|
35
|
+
$(MATHDIR)/gamma.o $(MATHDIR)/chi2.o \
|
|
36
|
+
tract.o istree.o apriori.o $(ADDOBJ)
|
|
37
|
+
|
|
38
|
+
#-----------------------------------------------------------------------
|
|
39
|
+
# Build Program
|
|
40
|
+
#-----------------------------------------------------------------------
|
|
41
|
+
all: apriori
|
|
42
|
+
|
|
43
|
+
apriori: $(OBJS) makefile
|
|
44
|
+
$(CC) $(LDFLAGS) $(OBJS) $(LIBS) -o $@
|
|
45
|
+
|
|
46
|
+
#-----------------------------------------------------------------------
|
|
47
|
+
# Main Program
|
|
48
|
+
#-----------------------------------------------------------------------
|
|
49
|
+
apriori.o: tract.h istree.h $(UTILDIR)/symtab.h
|
|
50
|
+
apriori.o: apriori.c makefile
|
|
51
|
+
$(CC) $(CFLAGS) -c apriori.c -o $@
|
|
52
|
+
|
|
53
|
+
#-----------------------------------------------------------------------
|
|
54
|
+
# Item and Transaction Management
|
|
55
|
+
#-----------------------------------------------------------------------
|
|
56
|
+
tract.o: tract.h $(UTILDIR)/symtab.h
|
|
57
|
+
tract.o: tract.c makefile
|
|
58
|
+
$(CC) $(CFLAGS) -c tract.c -o $@
|
|
59
|
+
|
|
60
|
+
#-----------------------------------------------------------------------
|
|
61
|
+
# Frequent Item Set Tree Management
|
|
62
|
+
#-----------------------------------------------------------------------
|
|
63
|
+
istree.o: istree.h tract.h $(MATHDIR)/gamma.h
|
|
64
|
+
istree.o: istree.c makefile
|
|
65
|
+
$(CC) $(CFLAGS) -c istree.c -o $@
|
|
66
|
+
|
|
67
|
+
#-----------------------------------------------------------------------
|
|
68
|
+
# External Modules
|
|
69
|
+
#-----------------------------------------------------------------------
|
|
70
|
+
# Each external object is built by the makefile in its own directory.
# '&&' stops the recipe if the directory change fails, and ADDFLAGS is
# quoted so that a value holding several flags reaches the sub-make as
# one variable assignment instead of being split into extra arguments.
$(UTILDIR)/vecops.o:
	cd $(UTILDIR) && $(MAKE) vecops.o  ADDFLAGS="$(ADDFLAGS)"
$(UTILDIR)/nimap.o:
	cd $(UTILDIR) && $(MAKE) nimap.o   ADDFLAGS="$(ADDFLAGS)"
$(UTILDIR)/tabscan.o:
	cd $(UTILDIR) && $(MAKE) tabscan.o ADDFLAGS="$(ADDFLAGS)"
$(UTILDIR)/scform.o:
	cd $(UTILDIR) && $(MAKE) scform.o  ADDFLAGS="$(ADDFLAGS)"
$(MATHDIR)/gamma.o:
	cd $(MATHDIR) && $(MAKE) gamma.o   ADDFLAGS="$(ADDFLAGS)"
$(MATHDIR)/chi2.o:
	cd $(MATHDIR) && $(MAKE) chi2.o    ADDFLAGS="$(ADDFLAGS)"
|
|
82
|
+
|
|
83
|
+
#-----------------------------------------------------------------------
|
|
84
|
+
# Storage Debugging
|
|
85
|
+
#-----------------------------------------------------------------------
|
|
86
|
+
storage.o: ../../misc/src/storage.h
|
|
87
|
+
storage.o: ../../misc/src/storage.c
|
|
88
|
+
$(CC) $(CFLAGS) -c ../../misc/src/storage.c -o $@
|
|
89
|
+
|
|
90
|
+
#-----------------------------------------------------------------------
|
|
91
|
+
# Install
|
|
92
|
+
#-----------------------------------------------------------------------
|
|
93
|
+
install:
|
|
94
|
+
cp apriori $(HOME)/bin
|
|
95
|
+
|
|
96
|
+
#-----------------------------------------------------------------------
|
|
97
|
+
# Clean up
|
|
98
|
+
#-----------------------------------------------------------------------
|
|
99
|
+
# Remove local build products, then clean the util and math modules.
# '&&' is required: with ';' a failed 'cd' would run '$(MAKE) clean'
# in this directory again and re-enter this very rule.
clean:
	rm -f *.o *~ *.flc core apriori
	cd $(UTILDIR) && $(MAKE) clean
	cd $(MATHDIR) && $(MAKE) clean
|
|
103
|
+
|
|
104
|
+
localclean:
|
|
105
|
+
rm -f *.o *~ *.flc core apriori
|
|
@@ -0,0 +1,870 @@
|
|
|
1
|
+
/*----------------------------------------------------------------------
|
|
2
|
+
File : tract.c
|
|
3
|
+
Contents: item and transaction management
|
|
4
|
+
Author : Christian Borgelt
|
|
5
|
+
History : 1996.02.14 file created as apriori.c
|
|
6
|
+
1996.06.24 function _get_item optimized
|
|
7
|
+
1996.07.01 adapted to modified symtab module
|
|
8
|
+
1998.01.04 scan functions moved to module 'tabscan'
|
|
9
|
+
1998.06.09 vector enlargement modified
|
|
10
|
+
1998.06.20 adapted to changed st_create function
|
|
11
|
+
1998.08.07 bug in function _get_tract (is_read) fixed
|
|
12
|
+
1998.08.08 item appearances added
|
|
13
|
+
1998.08.17 item sorting and recoding added
|
|
14
|
+
1998.09.02 several assertions added
|
|
15
|
+
1999.02.05 long int changed to int
|
|
16
|
+
1999.10.22 bug in item appearances reading fixed
|
|
17
|
+
1999.11.11 adapted to name/identifier maps
|
|
18
|
+
1999.12.01 check of item appearance added to sort function
|
|
19
|
+
2000.03.15 removal of infrequent items added
|
|
20
|
+
2001.07.14 adapted to modified module tabscan
|
|
21
|
+
2001.12.27 item functions made a separate module
|
|
22
|
+
2001.11.18 transaction functions made a separate module
|
|
23
|
+
2001.12.28 first version of this module completed
|
|
24
|
+
2002.01.12 empty field at end of record reported as error
|
|
25
|
+
2002.02.06 item sorting reversed (ascending order)
|
|
26
|
+
2002.02.19 transaction tree functions added
|
|
27
|
+
2003.07.17 functions is_filter, ta_filter, tas_filter added
|
|
28
|
+
2003.08.15 bug in function tat_delete fixed
|
|
29
|
+
2003.08.21 parameter 'heap' added to tas_sort, tat_create
|
|
30
|
+
2003.09.20 empty transactions in input made possible
|
|
31
|
+
2003.12.18 padding for 64 bit architecture added
|
|
32
|
+
2004.02.26 item frequency counting moved to is_read
|
|
33
|
+
2004.11.20 function tat_mark added
|
|
34
|
+
2005.06.20 function _nocmp added for neutral sorting
|
|
35
|
+
2006.11.26 structures ISFMTR and ISEVAL added
|
|
36
|
+
2007.02.13 adapted to modified tabscan module
|
|
37
|
+
2008.01.25 bug in function ise_eval fixed (prefix)
|
|
38
|
+
2008.06.30 support argument to ise_eval changed to double
|
|
39
|
+
----------------------------------------------------------------------*/
|
|
40
|
+
#include <stdio.h>
|
|
41
|
+
#include <stdlib.h>
|
|
42
|
+
#include <string.h>
|
|
43
|
+
#include <limits.h>
|
|
44
|
+
#include <assert.h>
|
|
45
|
+
#include <math.h>
|
|
46
|
+
#include "tract.h"
|
|
47
|
+
#include "scan.h"
|
|
48
|
+
#ifdef STORAGE
|
|
49
|
+
#include "storage.h"
|
|
50
|
+
#endif
|
|
51
|
+
|
|
52
|
+
/*----------------------------------------------------------------------
|
|
53
|
+
Preprocessor Definitions
|
|
54
|
+
----------------------------------------------------------------------*/
|
|
55
|
+
#define BLKSIZE 256 /* block size for enlarging vectors */
|
|
56
|
+
|
|
57
|
+
#define LN_2 0.69314718055994530942 /* ln(2) */
|
|
58
|
+
|
|
59
|
+
/*----------------------------------------------------------------------
|
|
60
|
+
Constants
|
|
61
|
+
----------------------------------------------------------------------*/
|
|
62
|
+
/* --- item appearance indicators --- */
|
|
63
|
+
static const char *i_body[] = { /* item to appear in bodies only */
|
|
64
|
+
"i", "in", "a", "ante", "antecedent", "b", "body", NULL };
|
|
65
|
+
static const char *i_head[] = { /* item to appear in heads only */
|
|
66
|
+
"o", "out", "c", "cons", "consequent", "h", "head", NULL };
|
|
67
|
+
static const char *i_both[] = { /* item to appear in both */
|
|
68
|
+
"io", "inout", "ac", "bh", "both", NULL };
|
|
69
|
+
static const char *i_ignore[] ={/* item to ignore */
|
|
70
|
+
"n", "neither", "none", "ign", "ignore", "-", NULL };
|
|
71
|
+
|
|
72
|
+
/*----------------------------------------------------------------------
|
|
73
|
+
Auxiliary Functions
|
|
74
|
+
----------------------------------------------------------------------*/
|
|
75
|
+
|
|
76
|
+
static int _appcode (const char *s)
|
|
77
|
+
{ /* --- get appearance indicator code */
|
|
78
|
+
const char **p; /* to traverse indicator list */
|
|
79
|
+
|
|
80
|
+
assert(s); /* check the function argument */
|
|
81
|
+
for (p = i_body; *p; p++) /* check 'body' indicators */
|
|
82
|
+
if (strcmp(s, *p) == 0) return APP_BODY;
|
|
83
|
+
for (p = i_head; *p; p++) /* check 'head' indicators */
|
|
84
|
+
if (strcmp(s, *p) == 0) return APP_HEAD;
|
|
85
|
+
for (p = i_both; *p; p++) /* check 'both' indicators */
|
|
86
|
+
if (strcmp(s, *p) == 0) return APP_BOTH;
|
|
87
|
+
for (p = i_ignore; *p; p++) /* check 'ignore' indicators */
|
|
88
|
+
if (strcmp(s, *p) == 0) return APP_NONE;
|
|
89
|
+
return -1; /* if none found, return error code */
|
|
90
|
+
} /* _appcode() */
|
|
91
|
+
|
|
92
|
+
/*--------------------------------------------------------------------*/
|
|
93
|
+
|
|
94
|
+
static int _get_item (ITEMSET *iset, FILE *file)
{                               /* --- read one item into the item set */
  int  delim;                   /* delimiter type from the scanner */
  char *name;                   /* scanner buffer with the field read */
  ITEM *it;                     /* name/identifier map entry */
  int  *grown;                  /* enlarged item vector */
  int  cap;                     /* (new) capacity of the item vector */

  assert(iset && file);         /* both arguments are required */
  delim = ts_next(iset->tscan, file, NULL, 0);
  name  = ts_buf(iset->tscan);  /* field just read = item name */
  if (delim == TS_ERR)          /* on a read error */
    return delim;               /* pass the error code on */
  if (name[0] == '\0')          /* an empty field adds no item, */
    return delim;               /* only the delimiter is reported */

  it = nim_byname(iset->nimap, name);
  if (it == NULL) {             /* the name is not yet in the map */
    if (iset->app == APP_NONE)  /* default appearance is 'ignore': */
      return delim;             /* unknown items are not registered */
    it = nim_add(iset->nimap, name, sizeof(ITEM));
    if (it == NULL) return E_NOMEM;
    it->frq = 0;                /* start both frequency counters */
    it->xfq = 0;                /* at zero and take over the */
    it->app = iset->app;        /* default appearance indicator */
  }

  cap = iset->vsz;              /* current capacity of the vector */
  if (iset->cnt >= cap) {       /* no free slot is left: */
    if (cap > BLKSIZE) cap += cap >> 1;   /* grow by one half or */
    else               cap += BLKSIZE;    /* by one block */
    grown = (int*)realloc(iset->items, cap *sizeof(int));
    if (grown == NULL) return E_NOMEM;    /* old vector stays valid */
    iset->items = grown;        /* install the enlarged vector */
    iset->vsz   = cap;          /* and note its capacity */
  }
  iset->items[iset->cnt] = it->id;   /* append the item identifier */
  iset->cnt += 1;
  return delim;                 /* report the delimiter type */
}  /* _get_item() */
|
|
125
|
+
|
|
126
|
+
/*--------------------------------------------------------------------*/
|
|
127
|
+
|
|
128
|
+
static int _nocmp (const void *p1, const void *p2, void *data)
|
|
129
|
+
{ /* --- compare item frequencies */
|
|
130
|
+
if (((const ITEM*)p1)->app == APP_NONE)
|
|
131
|
+
return (((const ITEM*)p2)->app == APP_NONE) ? 0 : 1;
|
|
132
|
+
if (((const ITEM*)p2)->app == APP_NONE) return -1;
|
|
133
|
+
#ifdef ARCH64
|
|
134
|
+
if (((const ITEM*)p1)->frq < (long)data)
|
|
135
|
+
return (((const ITEM*)p2)->frq < (long)data) ? 0 : 1;
|
|
136
|
+
if (((const ITEM*)p2)->frq < (long)data) return -1;
|
|
137
|
+
#else
|
|
138
|
+
if (((const ITEM*)p1)->frq < (int)data)
|
|
139
|
+
return (((const ITEM*)p2)->frq < (int)data) ? 0 : 1;
|
|
140
|
+
if (((const ITEM*)p2)->frq < (int)data) return -1;
|
|
141
|
+
#endif
|
|
142
|
+
if (((const ITEM*)p1)->id > ((const ITEM*)p2)->id) return 1;
|
|
143
|
+
if (((const ITEM*)p1)->id < ((const ITEM*)p2)->id) return -1;
|
|
144
|
+
return 0; /* return sign of identifier diff. */
|
|
145
|
+
} /* _nocmp() */
|
|
146
|
+
|
|
147
|
+
/*--------------------------------------------------------------------*/
|
|
148
|
+
|
|
149
|
+
static int _asccmp (const void *p1, const void *p2, void *data)
|
|
150
|
+
{ /* --- compare item frequencies */
|
|
151
|
+
if (((const ITEM*)p1)->app == APP_NONE)
|
|
152
|
+
return (((const ITEM*)p2)->app == APP_NONE) ? 0 : 1;
|
|
153
|
+
if (((const ITEM*)p2)->app == APP_NONE) return -1;
|
|
154
|
+
#ifdef ARCH64
|
|
155
|
+
if (((const ITEM*)p1)->frq < (long)data)
|
|
156
|
+
return (((const ITEM*)p2)->frq < (long)data) ? 0 : 1;
|
|
157
|
+
if (((const ITEM*)p2)->frq < (long)data) return -1;
|
|
158
|
+
#else
|
|
159
|
+
if (((const ITEM*)p1)->frq < (int)data)
|
|
160
|
+
return (((const ITEM*)p2)->frq < (int)data) ? 0 : 1;
|
|
161
|
+
if (((const ITEM*)p2)->frq < (int)data) return -1;
|
|
162
|
+
#endif
|
|
163
|
+
if (((const ITEM*)p1)->frq > ((const ITEM*)p2)->frq) return 1;
|
|
164
|
+
if (((const ITEM*)p1)->frq < ((const ITEM*)p2)->frq) return -1;
|
|
165
|
+
return 0; /* return sign of frequency diff. */
|
|
166
|
+
} /* _asccmp() */
|
|
167
|
+
|
|
168
|
+
/*--------------------------------------------------------------------*/
|
|
169
|
+
|
|
170
|
+
static int _descmp (const void *p1, const void *p2, void *data)
|
|
171
|
+
{ /* --- compare item frequencies */
|
|
172
|
+
if (((const ITEM*)p1)->app == APP_NONE)
|
|
173
|
+
return (((const ITEM*)p2)->app == APP_NONE) ? 0 : 1;
|
|
174
|
+
if (((const ITEM*)p2)->app == APP_NONE) return -1;
|
|
175
|
+
if (((const ITEM*)p1)->frq > ((const ITEM*)p2)->frq) return -1;
|
|
176
|
+
if (((const ITEM*)p1)->frq < ((const ITEM*)p2)->frq) return 1;
|
|
177
|
+
return 0; /* return sign of frequency diff. */
|
|
178
|
+
} /* _descmp() */
|
|
179
|
+
|
|
180
|
+
/*--------------------------------------------------------------------*/
|
|
181
|
+
|
|
182
|
+
static int _asccmpx (const void *p1, const void *p2, void *data)
|
|
183
|
+
{ /* --- compare item frequencies */
|
|
184
|
+
if (((const ITEM*)p1)->app == APP_NONE)
|
|
185
|
+
return (((const ITEM*)p2)->app == APP_NONE) ? 0 : 1;
|
|
186
|
+
if (((const ITEM*)p2)->app == APP_NONE) return -1;
|
|
187
|
+
#ifdef ARCH64
|
|
188
|
+
if (((const ITEM*)p1)->frq < (long)data)
|
|
189
|
+
return (((const ITEM*)p2)->frq < (long)data) ? 0 : 1;
|
|
190
|
+
if (((const ITEM*)p2)->frq < (long)data) return -1;
|
|
191
|
+
#else
|
|
192
|
+
if (((const ITEM*)p1)->frq < (int)data)
|
|
193
|
+
return (((const ITEM*)p2)->frq < (int)data) ? 0 : 1;
|
|
194
|
+
if (((const ITEM*)p2)->frq < (int)data) return -1;
|
|
195
|
+
#endif
|
|
196
|
+
if (((const ITEM*)p1)->xfq > ((const ITEM*)p2)->xfq) return 1;
|
|
197
|
+
if (((const ITEM*)p1)->xfq < ((const ITEM*)p2)->xfq) return -1;
|
|
198
|
+
return 0; /* return sign of frequency diff. */
|
|
199
|
+
} /* _asccmpx() */
|
|
200
|
+
|
|
201
|
+
/*--------------------------------------------------------------------*/
|
|
202
|
+
|
|
203
|
+
static int _descmpx (const void *p1, const void *p2, void *data)
|
|
204
|
+
{ /* --- compare item frequencies */
|
|
205
|
+
if (((const ITEM*)p1)->app == APP_NONE)
|
|
206
|
+
return (((const ITEM*)p2)->app == APP_NONE) ? 0 : 1;
|
|
207
|
+
if (((const ITEM*)p2)->app == APP_NONE) return -1;
|
|
208
|
+
#ifdef ARCH64
|
|
209
|
+
if (((const ITEM*)p1)->frq < (long)data)
|
|
210
|
+
return (((const ITEM*)p2)->frq < (long)data) ? 0 : 1;
|
|
211
|
+
if (((const ITEM*)p2)->frq < (long)data) return -1;
|
|
212
|
+
#else
|
|
213
|
+
if (((const ITEM*)p1)->frq < (int)data)
|
|
214
|
+
return (((const ITEM*)p2)->frq < (int)data) ? 0 : 1;
|
|
215
|
+
if (((const ITEM*)p2)->frq < (int)data) return -1;
|
|
216
|
+
#endif
|
|
217
|
+
if (((const ITEM*)p1)->xfq > ((const ITEM*)p2)->xfq) return -1;
|
|
218
|
+
if (((const ITEM*)p1)->xfq < ((const ITEM*)p2)->xfq) return 1;
|
|
219
|
+
return 0; /* return sign of frequency diff. */
|
|
220
|
+
} /* _descmpx() */
|
|
221
|
+
|
|
222
|
+
/*----------------------------------------------------------------------
|
|
223
|
+
Item Set Functions
|
|
224
|
+
----------------------------------------------------------------------*/
|
|
225
|
+
|
|
226
|
+
ITEMSET* is_create (int cnt)
|
|
227
|
+
{ /* --- create an item set */
|
|
228
|
+
ITEMSET *iset; /* created item set */
|
|
229
|
+
|
|
230
|
+
if (cnt <= 0) cnt = BLKSIZE; /* check and adapt number of items */
|
|
231
|
+
iset = malloc(sizeof(ITEMSET));
|
|
232
|
+
if (!iset) return NULL; /* create an item set */
|
|
233
|
+
iset->tscan = ts_create(); /* and its components */
|
|
234
|
+
ts_chars(iset->tscan, TS_NULL, "");
|
|
235
|
+
iset->nimap = nim_create(0, 0, (HASHFN*)0, (SYMFN*)0);
|
|
236
|
+
iset->items = (int*)malloc(cnt *sizeof(int));
|
|
237
|
+
if (!iset->tscan || !iset->nimap || !iset->items) {
|
|
238
|
+
is_delete(iset); return NULL; }
|
|
239
|
+
iset->tac = iset->cnt = 0; /* initialize the other fields */
|
|
240
|
+
iset->app = APP_BOTH;
|
|
241
|
+
iset->vsz = cnt;
|
|
242
|
+
iset->chars[0] = ' '; iset->chars[1] = ' ';
|
|
243
|
+
iset->chars[2] = '\n'; iset->chars[3] = '\0';
|
|
244
|
+
return iset; /* return the created item set */
|
|
245
|
+
} /* is_create() */
|
|
246
|
+
|
|
247
|
+
/*--------------------------------------------------------------------*/
|
|
248
|
+
|
|
249
|
+
void is_delete (ITEMSET *iset)
{                               /* --- release an item set */
  assert(iset);                 /* an item set must be given */
  /* Components may be missing (e.g. after a failed creation), */
  /* so each one is released only if it is present.            */
  if (iset->items != NULL) {    /* item vector */
    free(iset->items); }
  if (iset->nimap != NULL) {    /* name/identifier map */
    nim_delete(iset->nimap); }
  if (iset->tscan != NULL) {    /* table scanner */
    ts_delete(iset->tscan); }
  free(iset);                   /* finally the item set body */
}  /* is_delete() */
|
|
257
|
+
|
|
258
|
+
/*--------------------------------------------------------------------*/
|
|
259
|
+
|
|
260
|
+
void is_chars (ITEMSET *iset, const char *blanks, const char *fldseps,
               const char *recseps, const char *comment)
{                               /* --- configure special characters */
  /* Each non-NULL string is passed to the table scanner for its   */
  /* character class; a NULL argument leaves that class untouched. */
  /* For blanks, field and record separators the value returned by */
  /* ts_chars() is stored in iset->chars[0..2].                    */
  assert(iset);                 /* an item set must be given */
  if (blanks != NULL) {         /* blank characters */
    iset->chars[0] = ts_chars(iset->tscan, TS_BLANK,  blanks);  }
  if (fldseps != NULL) {        /* field separators */
    iset->chars[1] = ts_chars(iset->tscan, TS_FLDSEP, fldseps); }
  if (recseps != NULL) {        /* record separators */
    iset->chars[2] = ts_chars(iset->tscan, TS_RECSEP, recseps); }
  if (comment != NULL) {        /* comment indicators (not stored) */
    ts_chars(iset->tscan, TS_COMMENT, comment); }
}  /* is_chars() */
|
|
273
|
+
|
|
274
|
+
/*--------------------------------------------------------------------*/
|
|
275
|
+
|
|
276
|
+
/* Look up an item by name in the name/identifier map.
 * Returns the item's identifier, or -1 if the name is not found.
 */
int is_item (ITEMSET *iset, const char *name)
{                               /* --- get an item identifier */
  ITEM *found;                  /* result of the name lookup */

  found = nim_byname(iset->nimap, name);
  if (!found) return -1;        /* the name is not in the map */
  return found->id;             /* otherwise return its identifier */
}  /* is_item() */
|
|
281
|
+
|
|
282
|
+
/*--------------------------------------------------------------------*/
|
|
283
|
+
|
|
284
|
+
/* Read item appearance indicators from a file.
 *
 * The first record must consist of a single field holding the default
 * appearance; every following record is an item name followed by its
 * appearance indicator. Each item is added to the name/identifier map
 * with cleared frequency counters.
 *
 * Returns 0 when the end of the input is reached without error,
 * otherwise one of E_FREAD, E_FLDCNT, E_UNKAPP, E_ITEMEXP, E_DUPITEM,
 * E_NOMEM or E_APPEXP.
 */
int is_readapp (ITEMSET *iset, FILE *file)
{                               /* --- read appearance indicators */
  int  delim;                   /* delimiter type of the last field */
  char *field;                  /* scanner buffer (last field read) */
  ITEM *entry;                  /* newly added item */

  assert(iset && file);         /* check the function arguments */
  field = ts_buf(iset->tscan);  /* the scanner buffer receives each field */
  delim = ts_next(iset->tscan, file, NULL, 0);
  if (delim == TS_ERR) return E_FREAD;
  if (delim != TS_REC) return E_FLDCNT;   /* exactly one field expected */
  iset->app = _appcode(field);  /* default appearance indicator */
  if (iset->app < 0)   return E_UNKAPP;

  for (;;) {                    /* read item/indicator pairs */
    delim = ts_next(iset->tscan, file, NULL, 0);
    if (delim <= TS_EOF) {      /* end of input or read error */
      if (delim == TS_ERR) return E_FREAD;
      return 0;
    }
    if (field[0] == '\0')       /* an item name is required */
      return E_ITEMEXP;
    entry = nim_add(iset->nimap, field, sizeof(ITEM));
    if (entry == EXISTS) return E_DUPITEM;
    if (entry == NULL)   return E_NOMEM;
    entry->frq = 0;             /* clear the frequency counters */
    entry->xfq = 0;             /* (occurrence and sum of t.a. sizes) */
    if (delim != TS_FLD) return E_APPEXP; /* indicator field is missing */
    delim = ts_next(iset->tscan, file, NULL, 0);
    if (delim == TS_ERR) return E_FREAD;
    if (delim == TS_FLD) return E_FLDCNT; /* too many fields in record */
    entry->app = _appcode(field);
    if (entry->app < 0)  return E_UNKAPP;
  }
}  /* is_readapp() */
|
|
317
|
+
|
|
318
|
+
/*--------------------------------------------------------------------*/
|
|
319
|
+
|
|
320
|
+
/* Read one transaction from a file into the item set's buffer.
 *
 * Items are read with _get_item() until the record ends; the buffer is
 * then sorted, duplicates are removed, and for every remaining item the
 * frequency counter and the sum of transaction sizes are updated.
 *
 * Returns 0 on success, 1 at end of file (no item read), TS_ERR on a
 * read error, or E_ITEMEXP if an empty field follows at least one item.
 */
int is_read (ITEMSET *iset, FILE *file)
{                               /* --- read a transaction */
  int  pos, delim;              /* item position, delimiter type */
  char *field;                  /* scanner buffer (last field read) */
  ITEM *entry;                  /* item to update */

  assert(iset && file);         /* check the function arguments */
  iset->cnt = 0;                /* start with an empty transaction */
  delim = _get_item(iset, file);
  field = ts_buf(iset->tscan);

  if ((field[0] == '\0') && (delim == TS_EOF))
    return 1;                   /* nothing read at end of file */
  while ((field[0] != '\0') && (delim == TS_FLD)) {
    delim = _get_item(iset, file);  /* read up to the end of record */
  }
  if (delim == TS_ERR) return delim;
  if ((iset->cnt > 0) && (delim == TS_FLD) && (field[0] == '\0'))
    return E_ITEMEXP;           /* empty field inside a transaction */

  ta_sort(iset->items, iset->cnt);
  iset->cnt = ta_unique(iset->items, iset->cnt);
  for (pos = 0; pos < iset->cnt; pos++) {
    entry = nim_byid(iset->nimap, iset->items[pos]);
    entry->frq += 1;            /* count the item's occurrence */
    entry->xfq += iset->cnt;    /* and sum the transaction sizes */
  }
  iset->tac += 1;               /* count the transaction */
  return 0;                     /* return 'ok' */
}  /* is_read() */
|
|
351
|
+
|
|
352
|
+
/*--------------------------------------------------------------------*/
|
|
353
|
+
|
|
354
|
+
/* Recode the items with respect to their frequency.
 *
 * dir selects the comparison function handed to nim_sort():
 *   > 1: _asccmpx, 1: _asccmp, 0: _nocmp, -1: _descmp, < -1: _descmpx.
 * After sorting, the items are scanned from the highest identifier
 * downwards: items with frq < minfrq are set to APP_NONE, and the scan
 * stops at the first item that is frequent and not APP_NONE.
 * If map is given, the current transaction is recoded through it,
 * items with a new identifier above the stop position are dropped,
 * and the transaction is sorted again.
 *
 * Returns the stop position plus one (number of frequent items).
 */
int is_recode (ITEMSET *iset, int minfrq, int dir, int *map)
{                               /* --- recode items w.r.t. frequency */
  int      src, dst, n, code;   /* loop variables, buffer */
  ITEM     *item;               /* to traverse the items */
  SYMCMPFN *cmp;                /* comparison function */

  assert(iset);                 /* check the function arguments */
  if      (dir >=  2) cmp = _asccmpx;
  else if (dir ==  1) cmp = _asccmp;
  else if (dir ==  0) cmp = _nocmp;
  else if (dir == -1) cmp = _descmp;
  else                cmp = _descmpx;
  nim_sort(iset->nimap, cmp, (void*)minfrq, map, 1);

  n = nim_cnt(iset->nimap);     /* scan from the last item downwards */
  while (--n >= 0) {
    item = (ITEM*)nim_byid(iset->nimap, n);
    if (item->frq < minfrq) {   /* infrequent items are ignored */
      item->app = APP_NONE; continue; }
    if (item->app != APP_NONE)  /* first frequent, usable item: */
      break;                    /* all lower ids are kept */
  }

  if (map) {                    /* if a map vector is provided */
    dst = 0;
    for (src = 0; src < iset->cnt; src++) {
      code = map[iset->items[src]];
      if (code > n) continue;   /* drop items to ignore */
      iset->items[dst++] = code;
    }
    iset->cnt = dst;            /* note the new transaction size */
    ta_sort(iset->items, dst);  /* and resort the items */
  }
  return n+1;                   /* return number of frequent items */
}  /* is_recode() */
|
|
384
|
+
|
|
385
|
+
/*--------------------------------------------------------------------*/
|
|
386
|
+
|
|
387
|
+
/* Filter the current transaction of an item set with ta_filter()
 * and store the resulting item count. Returns the new item count.
 */
int is_filter (ITEMSET *iset, const char *marks)
{                               /* --- filter items in transaction */
  iset->cnt = ta_filter(iset->items, iset->cnt, marks);
  return iset->cnt;             /* number of items that remain */
}  /* is_filter() */
|
|
391
|
+
|
|
392
|
+
/*----------------------------------------------------------------------
|
|
393
|
+
Item Set Evaluation Functions
|
|
394
|
+
----------------------------------------------------------------------*/
|
|
395
|
+
|
|
396
|
+
/* Create an item set evaluator.
 *
 * The evaluator is allocated in one block: the structure is followed
 * by room for the logarithm sums (lsums, is_cnt()+1 entries) and the
 * logarithms of the item frequencies (logfs, is_cnt() entries).
 *
 * tacnt  number of transactions (its logarithm is stored in logta)
 *
 * Returns the evaluator or NULL if the allocation fails.
 */
ISEVAL* ise_create (ITEMSET *iset, int tacnt)
{                               /* --- create an item set evaluation */
  int    k, n;                  /* loop variable, number of items */
  ISEVAL *eval;                 /* created item set evaluator */

  n = is_cnt(iset);             /* get the number of items */
  eval = (ISEVAL*)malloc(sizeof(ISEVAL) +(n+n) *sizeof(double));
  if (!eval) return NULL;
  eval->logfs = eval->lsums +n +1;  /* logfs follows the lsums section */
  eval->logta = log(tacnt);     /* log of the number of transactions */
  for (k = n-1; k >= 0; k--) {  /* logarithms of the item frequencies */
    eval->logfs[k] = log(is_getfrq(iset, k));
  }
  eval->lsums[0] = 0;           /* init. first sum of logarithms */
  return eval;                  /* return created item set evaluator */
}  /* ise_create() */
|
|
411
|
+
|
|
412
|
+
/*--------------------------------------------------------------------*/
|
|
413
|
+
|
|
414
|
+
/* Evaluate an item set.
 *
 * ids   item identifiers of the set, cnt of them
 * pfx   number of leading items whose logarithm sum is still valid in
 *       eval->lsums from a previous call
 * supp  support of the item set
 *
 * The running sums for positions pfx..cnt-1 are stored in eval->lsums.
 * Returns (log(supp) - sum + (cnt-1)*logta) / LN_2.
 */
double ise_eval (ISEVAL *eval, int *ids, int cnt, int pfx, double supp)
{                               /* --- evaluate an item set */
  int    pos;                   /* position in the item set */
  double acc = 0;               /* sum of logarithms of frequencies */

  if (pfx > 0)                  /* reuse the sum of the shared prefix */
    acc = eval->lsums[pfx-1];
  for (pos = pfx; pos < cnt; pos++) {
    acc += eval->logfs[ids[pos]];
    eval->lsums[pos] = acc;     /* remember the running sum */
  }
  return (log(supp) -acc +(cnt-1) *eval->logta) * (1.0/LN_2);
}  /* ise_eval() */
|
|
424
|
+
|
|
425
|
+
/*----------------------------------------------------------------------
|
|
426
|
+
Item Set Formatting Functions
|
|
427
|
+
----------------------------------------------------------------------*/
|
|
428
|
+
|
|
429
|
+
/* Create an item set formatter.
 *
 * iset  item set whose item names are to be formatted
 * scan  if nonzero, names are passed through sc_format(); names that
 *       grow by formatting are stored as newly allocated copies
 *
 * The formatter is allocated in one block: the structure is followed by
 * the name pointers and the prefix offsets. The output buffer is sized
 * for all (formatted) names plus one separator each and a terminator.
 *
 * Returns the formatter or NULL if an allocation fails.
 */
ISFMTR* isf_create (ITEMSET *iset, int scan)
{                               /* --- create an item set formatter */
  int        i, k, n;           /* loop variable, buffers */
  int        len, sum;          /* length of an item name and sum */
  size_t     extra;             /* name pointers beyond names[0] */
  ISFMTR     *fmt;              /* created item set formatter */
  char       buf[4*TS_SIZE+4];  /* buffer for formatting */
  const char *name;             /* to traverse the item names */
  char       *copy;             /* for copies of formatted names */

  n = is_cnt(iset);             /* get the number of items */
  /* isf_format() writes offs[0..cnt] with cnt <= n, so n+1 offsets */
  /* are needed; also avoid the size wrap-around of (n-1) for n == 0 */
  extra = (n > 1) ? (size_t)(n-1) : 0;
  fmt = (ISFMTR*)malloc(sizeof(ISFMTR) +((size_t)n+1) *sizeof(int)
                                       +extra *sizeof(char*));
  if (!fmt) return NULL;        /* create the base structure */
  fmt->buf  = NULL;             /* and organize the memory */
  fmt->offs = (int*)(fmt->names +n);
  for (i = sum = fmt->cnt = 0; i < n; i++) {
    name = is_name(iset, i);    /* traverse the item names */
    len  = (int)strlen(name);   /* and get their length */
    sum += k = (scan) ? sc_format(buf, name, 0) : len;
    if (k > len) {              /* if formatting was needed */
      copy = (char*)malloc((k+1) *sizeof(char));
      if (!copy) {              /* names[0..i-1] are valid: let */
        fmt->cnt = i;           /* isf_delete() release all of them */
        isf_delete(fmt); return NULL; }
      name = strcpy(copy, buf); /* copy the formatted name */
    }                           /* into a newly created string */
    fmt->names[i] = name;       /* store (formatted) item name */
  }                             /* afterwards create output buffer */
  if (scan) fmt->cnt = n;       /* note the number of items */
  fmt->buf = (char*)malloc((sum +n +1) *sizeof(char));
  if (!fmt->buf) { isf_delete(fmt); return NULL; }
  fmt->offs[0] = 0;             /* init. the first prefix offset */
  return fmt;                   /* return created item set formatter */
}  /* isf_create() */
|
|
461
|
+
|
|
462
|
+
/*--------------------------------------------------------------------*/
|
|
463
|
+
|
|
464
|
+
/* Delete an item set formatter.
 *
 * Among the first fmt->cnt names, those that are non-NULL and start
 * with a double quote are freed; then the output buffer and the
 * formatter itself are released.
 */
void isf_delete (ISFMTR *fmt)
{                               /* --- delete an item set formatter */
  int        idx;               /* loop variable */
  const char *nm;               /* name under inspection */

  for (idx = fmt->cnt -1; idx >= 0; idx--) {
    nm = fmt->names[idx];
    if ((nm == NULL) || (nm[0] != '"'))
      continue;                 /* skip names that are not freed here */
    free((void*)nm);            /* delete a reformatted item name */
  }
  free(fmt->buf);               /* free(NULL) is a no-op */
  free(fmt);                    /* delete the base structure */
}  /* isf_delete() */
|
|
474
|
+
|
|
475
|
+
/*--------------------------------------------------------------------*/
|
|
476
|
+
|
|
477
|
+
/* Format an item set into the formatter's output buffer.
 *
 * ids  item identifiers, cnt of them
 * pre  number of leading items already present in the buffer from a
 *      previous call (their end offset is taken from fmt->offs[pre])
 *
 * Each appended name is followed by a blank, and the offset after each
 * item is recorded in fmt->offs. The length of the result is stored in
 * fmt->len. Returns the output buffer.
 */
const char* isf_format (ISFMTR *fmt, int *ids, int cnt, int pre)
{                               /* --- format an item set */
  int        pos;               /* position in the item set */
  char       *out;              /* write position in output buffer */
  const char *src;              /* to traverse an item name */

  out = fmt->buf +fmt->offs[pre];   /* continue after the prefix */
  for (pos = pre; pos < cnt; pos++) {
    for (src = fmt->names[ids[pos]]; *src != '\0'; src++)
      *out++ = *src;            /* copy the item name */
    *out++ = ' ';               /* add an item separator */
    fmt->offs[pos+1] = (int)(out -fmt->buf);
  }                             /* record the new offset */
  *out = '\0';                  /* terminate the formatted item set */
  fmt->len = (int)(out -fmt->buf);
  return fmt->buf;              /* return the output buffer */
}  /* isf_format() */
|
|
493
|
+
|
|
494
|
+
/*----------------------------------------------------------------------
|
|
495
|
+
Transaction Functions
|
|
496
|
+
----------------------------------------------------------------------*/
|
|
497
|
+
|
|
498
|
+
/* Remove duplicates from a sorted item vector in place.
 *
 * Only adjacent equal items are collapsed, so the vector has to be
 * sorted for all duplicates to be removed.
 * Returns the number of items that remain.
 */
int ta_unique (int *items, int n)
{                               /* --- remove duplicate items */
  int rd, wr;                   /* read and write positions */

  assert(items && (n >= 0));    /* check the function arguments */
  if (n <= 1) return n;         /* nothing to do for 0 or 1 item */
  wr = 0;                       /* the first item is always kept */
  for (rd = 1; rd < n; rd++) {
    if (items[rd] != items[wr]) /* keep an item only if it differs */
      items[++wr] = items[rd];  /* from the last item kept */
  }
  return wr +1;                 /* return the new number of items */
}  /* ta_unique() */
|
|
508
|
+
|
|
509
|
+
/*--------------------------------------------------------------------*/
|
|
510
|
+
|
|
511
|
+
/* Filter the items of a transaction in place.
 *
 * An item is kept if marks[item] is nonzero; the kept items are moved
 * to the front in their original order.
 * Returns the number of items that remain.
 */
int ta_filter (int *items, int n, const char *marks)
{                               /* --- filter items in a transaction */
  int *rd, *wr, *end;           /* read/write positions, end of vector */

  assert(items && (n >= 0));    /* check the function arguments */
  wr  = items;
  end = items +n;
  for (rd = items; rd < end; rd++) {
    if (!marks[*rd]) continue;  /* drop all unmarked items */
    *wr++ = *rd;                /* and compact the marked ones */
  }
  return (int)(wr -items);      /* return the new number of items */
}  /* ta_filter() */
|
|
520
|
+
|
|
521
|
+
/*--------------------------------------------------------------------*/
|
|
522
|
+
|
|
523
|
+
static int ta_cmp (const void *p1, const void *p2, void *data)
|
|
524
|
+
{ /* --- compare transactions */
|
|
525
|
+
int k, k1, k2; /* loop variable, counters */
|
|
526
|
+
const int *i1, *i2; /* to traverse the item identifiers */
|
|
527
|
+
|
|
528
|
+
assert(p1 && p2); /* check the function arguments */
|
|
529
|
+
i1 = ((const TRACT*)p1)->items;
|
|
530
|
+
i2 = ((const TRACT*)p2)->items;
|
|
531
|
+
k1 = ((const TRACT*)p1)->cnt; /* get the item vectors */
|
|
532
|
+
k2 = ((const TRACT*)p2)->cnt; /* and the numbers of items */
|
|
533
|
+
for (k = (k1 < k2) ? k1 : k2; --k >= 0; i1++, i2++) {
|
|
534
|
+
if (*i1 > *i2) return 1; /* compare corresponding items */
|
|
535
|
+
if (*i1 < *i2) return -1; /* and abort the comparison */
|
|
536
|
+
} /* if one of them is greater */
|
|
537
|
+
if (k1 > k2) return 1; /* if one of the transactions */
|
|
538
|
+
if (k1 < k2) return -1; /* is not empty, it is greater */
|
|
539
|
+
return 0; /* otherwise the two trans. are equal */
|
|
540
|
+
} /* ta_cmp() */
|
|
541
|
+
|
|
542
|
+
/*--------------------------------------------------------------------*/
|
|
543
|
+
|
|
544
|
+
static int ta_cmpx (const TRACT *ta, const int *items, int n)
|
|
545
|
+
{ /* --- compare transactions */
|
|
546
|
+
int k, m; /* loop variable, counter */
|
|
547
|
+
const int *p; /* to traverse the item identifiers */
|
|
548
|
+
|
|
549
|
+
assert(ta && items); /* check the function arguments */
|
|
550
|
+
p = ta->items; m = ta->cnt; /* traverse the item vector */
|
|
551
|
+
m = ta->cnt;
|
|
552
|
+
for (k = (n < m) ? n : m; --k >= 0; p++, items++) {
|
|
553
|
+
if (*p > *items) return 1; /* compare corresponding items */
|
|
554
|
+
if (*p < *items) return -1; /* and abort the comparison */
|
|
555
|
+
} /* if one of them is greater */
|
|
556
|
+
if (m > n) return 1; /* if one of the transactions */
|
|
557
|
+
if (m < n) return -1; /* is not empty, it is greater */
|
|
558
|
+
return 0; /* otherwise the two trans. are equal */
|
|
559
|
+
} /* ta_cmpx() */
|
|
560
|
+
|
|
561
|
+
/*----------------------------------------------------------------------
|
|
562
|
+
Transaction Set Functions
|
|
563
|
+
----------------------------------------------------------------------*/
|
|
564
|
+
|
|
565
|
+
/* Create an empty transaction set that refers to the given item set.
 * Returns the transaction set or NULL if the allocation fails.
 */
TASET* tas_create (ITEMSET *itemset)
{                               /* --- create a transaction set */
  TASET *set;                   /* created transaction set */

  assert(itemset);              /* check the function argument */
  set = malloc(sizeof(TASET));
  if (set == NULL) return NULL;
  set->itemset = itemset;       /* store the item set */
  set->tracts  = NULL;          /* no transaction vector yet */
  set->total   = 0;             /* and clear all counters */
  set->max     = 0;
  set->vsz     = 0;
  set->cnt     = 0;
  return set;                   /* return the created t.a. set */
}  /* tas_create() */
|
|
577
|
+
|
|
578
|
+
/*--------------------------------------------------------------------*/
|
|
579
|
+
|
|
580
|
+
/* Delete a transaction set with all transactions stored in it.
 *
 * delis  if nonzero, the associated item set is deleted as well
 */
void tas_delete (TASET *taset, int delis)
{                               /* --- delete a transaction set */
  int pos;                      /* index of the transaction to free */

  assert(taset);                /* check the function argument */
  if (taset->tracts != NULL) {  /* if there are loaded transactions */
    for (pos = taset->cnt -1; pos >= 0; pos--)
      free(taset->tracts[pos]); /* delete all transactions */
    free(taset->tracts);        /* and the transaction vector */
  }
  if (delis && taset->itemset) {
    is_delete(taset->itemset);  /* delete the item set on request */
  }
  free(taset);                  /* delete the transaction set body */
}  /* tas_delete() */
|
|
591
|
+
|
|
592
|
+
/*--------------------------------------------------------------------*/
|
|
593
|
+
|
|
594
|
+
/* Add a transaction to a transaction set.
 *
 * items  items of the transaction; if NULL, the current transaction of
 *        the associated item set is used (is_tract()/is_tsize())
 * n      number of items (ignored if items is NULL)
 *
 * A full transaction vector is enlarged by BLKSIZE entries, or by half
 * its size once it is larger than BLKSIZE. The items are copied into
 * a newly allocated transaction; max and total are updated.
 *
 * Returns 0 on success, -1 if memory could not be allocated.
 */
int tas_add (TASET *taset, const int *items, int n)
{                               /* --- add a transaction */
  TRACT *ta;                    /* new transaction */
  TRACT **grown;                /* enlarged transaction vector */
  int   cap, pos;               /* vector capacity, loop variable */

  assert(taset);                /* check the function arguments */
  cap = taset->vsz;             /* get the transaction vector size */
  if (taset->cnt >= cap) {      /* if the transaction vector is full */
    if (cap > BLKSIZE) cap += cap >> 1;
    else               cap += BLKSIZE;
    grown = (TRACT**)realloc(taset->tracts, cap *sizeof(TRACT*));
    if (grown == NULL) return -1;   /* the old vector stays valid */
    taset->tracts = grown;
    taset->vsz    = cap;
  }
  if (items == NULL) {          /* if no transaction is given, */
    items = is_tract(taset->itemset);   /* get it from the item set */
    n     = is_tsize(taset->itemset);
  }
  ta = (TRACT*)malloc(sizeof(TRACT) +(n-1) *sizeof(int));
  if (ta == NULL) return -1;    /* create a new transaction */
  taset->tracts[taset->cnt] = ta;
  taset->cnt += 1;              /* store the transaction */
  if (taset->max < n)           /* update maximal transaction size */
    taset->max = n;
  taset->total += n;            /* sum the number of items */
  ta->cnt = n;
  for (pos = n-1; pos >= 0; pos--)
    ta->items[pos] = items[pos];    /* copy the items of the t.a. */
  return 0;                     /* return 'ok' */
}  /* tas_add() */
|
|
623
|
+
|
|
624
|
+
/*--------------------------------------------------------------------*/
|
|
625
|
+
|
|
626
|
+
/* Recode the items of all transactions in a transaction set.
 *
 * map  maps old item identifiers to new ones
 * cnt  items whose new identifier is >= cnt are removed
 *
 * Every transaction is recoded in place and sorted again; the maximal
 * transaction size and the total number of items are recomputed.
 */
void tas_recode (TASET *taset, int *map, int cnt)
{                               /* --- recode items */
  int   tidx, src, dst, code;   /* loop variables, buffer */
  TRACT *ta;                    /* to traverse the transactions */
  int   *ids;                   /* items of the current transaction */

  assert(taset && map);         /* check the function arguments */
  taset->total = 0;             /* clear the total and */
  taset->max   = 0;             /* the maximal size */
  for (tidx = taset->cnt -1; tidx >= 0; tidx--) {
    ta  = taset->tracts[tidx];
    ids = ta->items;
    dst = 0;
    for (src = 0; src < ta->cnt; src++) {
      code = map[ids[src]];     /* recode the item and */
      if (code >= cnt) continue;    /* skip superfluous items */
      ids[dst++] = code;
    }
    if (taset->max < dst)       /* update the max. transaction size */
      taset->max = dst;
    taset->total += dst;        /* sum the number of items */
    ta->cnt = dst;              /* note the new size and */
    ta_sort(ids, dst);          /* resort the item identifiers */
  }
}  /* tas_recode() */
|
|
647
|
+
|
|
648
|
+
/*--------------------------------------------------------------------*/
|
|
649
|
+
|
|
650
|
+
/* Filter the items of all transactions in a transaction set.
 *
 * marks  an item is kept if marks[item] is nonzero (see ta_filter())
 *
 * The total number of items and the maximal transaction size stored in
 * the transaction set are recomputed (the latter was previously left
 * at its stale pre-filter value, unlike in tas_recode()).
 *
 * Returns the maximal number of items in a transaction after filtering.
 */
int tas_filter (TASET *taset, const char *marks)
{                               /* --- filter items in a trans. set */
  int   i, max = 0;             /* loop variable, max. num. of items */
  TRACT *t;                     /* to traverse the transactions */

  assert(taset && marks);       /* check the function arguments */
  taset->total = 0;             /* clear the total number of items */
  for (i = taset->cnt; --i >= 0; ) {
    t = taset->tracts[i];       /* traverse the transactions */
    t->cnt = ta_filter(t->items, t->cnt, marks);
    if (t->cnt > max) max = t->cnt;
    taset->total += t->cnt;     /* filter each transaction and */
  }                             /* update maximal size and total */
  taset->max = max;             /* keep the stored maximum in sync */
  return max;                   /* return maximum number of items */
}  /* tas_filter() */
|
|
665
|
+
|
|
666
|
+
/*--------------------------------------------------------------------*/
|
|
667
|
+
|
|
668
|
+
/* Sort the transactions of a transaction set with ta_cmp().
 *
 * heap  if nonzero v_heapsort() is used, otherwise v_sort()
 */
void tas_sort (TASET *taset, int heap)
{                               /* --- sort a transaction set */
  assert(taset);                /* check the function argument */
  if (!heap) {                  /* default sorting function */
    v_sort(taset->tracts, taset->cnt, ta_cmp, NULL);
    return;
  }
  v_heapsort(taset->tracts, taset->cnt, ta_cmp, NULL);
}  /* tas_sort() */
|
|
674
|
+
|
|
675
|
+
/*--------------------------------------------------------------------*/
|
|
676
|
+
|
|
677
|
+
/* Count how often a transaction occurs in a transaction set.
 *
 * items  item vector of the transaction to look for, n items
 *
 * Two binary searches (using ta_cmpx()) locate the end and the start
 * of the run of equal transactions.
 * NOTE(review): this presupposes that the transaction vector is ordered
 * consistently with ta_cmpx() - presumably tas_sort() has to be called
 * first; confirm against the callers.
 *
 * Returns the number of occurrences.
 */
int tas_occur (TASET *taset, const int *items, int n)
{                               /* --- count transaction occurrences */
  int first, past, mid;         /* run boundaries, probe position */
  int lim = taset->cnt;         /* exclusive upper search limit */

  assert(taset && items);       /* check the function arguments */
  past = 0;                     /* find the right boundary: first */
  while (past < lim) {          /* transaction greater than items */
    mid = (past + lim) >> 1;
    if (ta_cmpx(taset->tracts[mid], items, n) > 0) lim  = mid;
    else                                           past = mid+1;
  }
  first = 0;                    /* find the left boundary: first */
  while (first < lim) {         /* transaction not smaller than items */
    mid = (first + lim) >> 1;   /* (lim starts at the right boundary) */
    if (ta_cmpx(taset->tracts[mid], items, n) < 0) first = mid+1;
    else                                           lim   = mid;
  }
  return past -first;           /* compute the number of occurrences */
}  /* tas_occur() */
|
|
694
|
+
|
|
695
|
+
/*--------------------------------------------------------------------*/
|
|
696
|
+
#ifndef NDEBUG
|
|
697
|
+
|
|
698
|
+
/* Print a transaction set to stdout (debugging aid).
 *
 * Each transaction is printed on its own line as a blank-separated
 * list of item names, followed by a final line with the number of
 * transactions.
 */
void tas_show (TASET *taset)
{                               /* --- show a transaction set */
  int   i, k;                   /* loop variables */
  TRACT *t;                     /* to traverse the transactions */

  assert(taset);                /* check the function argument */
  for (i = 0; i < taset->cnt; i++) {
    t = taset->tracts[i];       /* traverse the transactions */
    for (k = 0; k < t->cnt; k++) {  /* traverse the items */
      if (k > 0) putc(' ', stdout); /* print a separator */
      /* item names are data, not format strings: a '%' in a name */
      /* must be printed literally and must not be interpreted */
      fputs(is_name(taset->itemset, t->items[k]), stdout);
    }                           /* print the next item */
    putc('\n', stdout);         /* terminate the transaction */
  }                             /* finally print the number of t.a. */
  printf("%d transaction(s)\n", taset->cnt);
}  /* tas_show() */
|
|
714
|
+
|
|
715
|
+
#endif
|
|
716
|
+
/*----------------------------------------------------------------------
|
|
717
|
+
Transaction Tree Functions
|
|
718
|
+
----------------------------------------------------------------------*/
|
|
719
|
+
|
|
720
|
+
/* Recursive part of tat_create(): build the subtree for a section of
 * the (sorted) transaction vector.
 *
 * tracts  first transaction of the section
 * cnt     number of transactions in the section
 * index   position of the item at which the transactions are split
 *
 * A section with at most one transaction becomes a leaf: the remaining
 * items are stored directly, size is set to the negated item count and
 * max to the item count. Otherwise transactions with no item at
 * position index are skipped, the node gets one entry per distinct
 * item at that position, and a child is created recursively for each
 * run of transactions that share the item. The child pointers are
 * stored behind the item vector (padded under ARCH64).
 *
 * Returns the created node, or NULL if memory is exhausted (children
 * created so far are deleted again).
 */
TATREE* _create (TRACT **tracts, int cnt, int index)
{                               /* --- recursive part of tat_create() */
  int    pos, last;             /* scan position, section end index */
  int    cur, prev;             /* current and previous item */
  int    n, pad, depth;         /* item counter, padded size, buffer */
  TATREE *node;                 /* created transaction tree node */
  TATREE **kids;                /* vector of child pointers */

  assert(tracts                 /* check the function arguments */
      && (cnt >= 0) && (index >= 0));

  if (cnt <= 1) {               /* if at most one transaction is left */
    n = 0;
    if (cnt > 0) n = (*tracts)->cnt -index;
    node = (TATREE*)malloc(sizeof(TATREE) +(n-1) *sizeof(int));
    if (!node) return NULL;     /* create a leaf node */
    node->cnt  = cnt;           /* and initialize its fields */
    node->size = -n;
    node->max  =  n;
    for (pos = n-1; pos >= 0; pos--)
      node->items[pos] = (*tracts)->items[index +pos];
    return node;                /* copy the rest of the transaction */
  }

  last = cnt;                   /* skip t.a. that are too short */
  while ((--last >= 0) && ((*tracts)->cnt <= index))
    tracts++;
  last++;                       /* number of remaining transactions */

  n = 0; prev = -1;             /* count the different items */
  for (pos = last -1; pos >= 0; pos--) {
    cur = tracts[pos]->items[index];
    if (cur != prev) { prev = cur; n++; }
  }
#ifdef ARCH64                   /* adapt to even item number */
  pad = (n & 1) ? n : (n+1);    /* so that pointer addresses are */
#else                           /* multiples of 8 on 64 bit systems */
  pad = n;                      /* on 32 bit systems, however, */
#endif                          /* use the exact number of items */
  node = (TATREE*)malloc(sizeof(TATREE) + (pad-1) *sizeof(int)
                                        + n *sizeof(TATREE*));
  if (!node) return NULL;       /* create a transaction tree node */
  node->cnt  = cnt;             /* and initialize its fields */
  node->size = n;
  node->max  = 0;
  if (n <= 0) return node;      /* if t.a. are fully captured, abort */

  kids = (TATREE**)(node->items +pad);
  last--;                       /* index of the last transaction */
  prev = tracts[last]->items[index];
  for (pos = last -1; pos >= 0; pos--) {
    cur = tracts[pos]->items[index];
    if (cur == prev) continue;  /* still inside the current run */
    node->items[--n] = prev;    /* the run pos+1..last is complete: */
    prev = cur;                 /* note its item and create a child */
    kids[n] = _create(tracts +pos +1, last -pos, index+1);
    if (!kids[n]) break;        /* abort on failure */
    depth = kids[n]->max +1;
    if (depth > node->max) node->max = depth;
    last = pos;                 /* the next run ends here */
  }
  if (pos < 0) {                /* if child creation was successful */
    node->items[--n] = prev;    /* note the last item identifier */
    kids[n] = _create(tracts, last +1, index+1);
    if (kids[n]) {              /* create the last child */
      depth = kids[n]->max +1;
      if (depth > node->max) node->max = depth;
      return node;              /* return the created subtree */
    }
  }
  for (pos = node->size -1; pos > n; pos--)
    tat_delete(kids[pos]);      /* on error delete created subtrees */
  free(node);                   /* and the transaction tree node */
  return NULL;
}  /* _create() */
|
|
781
|
+
|
|
782
|
+
/*--------------------------------------------------------------------*/
|
|
783
|
+
|
|
784
|
+
/* Create a transaction tree from a transaction set.
 *
 * The transactions are first sorted with ta_cmp() (v_heapsort() if
 * heap is nonzero, v_sort() otherwise); the tree is then built by
 * _create(). Returns the tree as delivered by _create().
 */
TATREE* tat_create (TASET *taset, int heap)
{                               /* --- create a transactions tree */
  assert(taset);                /* check the function argument */
  if (!heap) v_sort    (taset->tracts, taset->cnt, ta_cmp, NULL);
  else       v_heapsort(taset->tracts, taset->cnt, ta_cmp, NULL);
  return _create(taset->tracts, taset->cnt, 0);
}  /* tat_create() */
|
|
791
|
+
|
|
792
|
+
/*--------------------------------------------------------------------*/
|
|
793
|
+
|
|
794
|
+
/* Delete a transaction tree recursively.
 *
 * Nodes with size <= 0 have no child pointers (leaves store a negated
 * item count in size), so the child vector is only located and
 * traversed for nodes with size > 0. This avoids forming a pointer
 * with a negative offset from the item vector for leaf nodes.
 */
void tat_delete (TATREE *tat)
{                               /* --- delete a transaction tree */
  int    i, pad;                /* loop variable, padded item count */
  TATREE **vec;                 /* vector of child nodes */

  assert(tat);                  /* check the function argument */
  if (tat->size > 0) {          /* if the node has children */
#ifdef ARCH64                   /* if 64 bit architecture */
    pad = (tat->size & 1) ? tat->size : (tat->size+1);
#else                           /* address must be a multiple of 8 */
    pad = tat->size;            /* on 32 bit systems, however, */
#endif                          /* use the number of items directly */
    vec = (TATREE**)(tat->items +pad);
    for (i = tat->size; --i >= 0; )
      tat_delete(vec[i]);       /* recursively delete the subtrees */
  }
  free(tat);                    /* and the tree node itself */
}  /* tat_delete() */
|
|
810
|
+
|
|
811
|
+
/*--------------------------------------------------------------------*/
|
|
812
|
+
#ifdef ARCH64
|
|
813
|
+
|
|
814
|
+
/* Return the child node with the given index (ARCH64 variant).
 *
 * The child pointers follow the item vector, whose length is rounded
 * up to the next odd number (size if odd, size+1 if even).
 */
TATREE* tat_child (TATREE *tat, int index)
{                               /* --- go to a child node */
  TATREE **kids;                /* vector of child pointers */

  assert(tat                    /* check the function arguments */
      && (index >= 0) && (index < tat->size));
  /* for size > 0, (size | 1) is size if odd and size+1 if even */
  kids = (TATREE**)(tat->items +(tat->size | 1));
  return kids[index];           /* return the child node/subtree */
}  /* tat_child */
|
|
823
|
+
|
|
824
|
+
#endif
|
|
825
|
+
/*--------------------------------------------------------------------*/
|
|
826
|
+
|
|
827
|
+
/* Mark the end of each transaction in a transaction tree.
 *
 * In a node with negative size (a leaf holding items) the item at
 * position max-1 gets the INT_MIN bit set; for all other nodes the
 * children are processed recursively.
 */
void tat_mark (TATREE *tat)
{                               /* --- mark end of transactions */
  int i;                        /* loop variable */

  assert(tat);                  /* check the function argument */
  if (tat->size < 0) {          /* if there is a transaction, */
    tat->items[tat->max-1] |= INT_MIN;
    return;                     /* mark its last item and stop */
  }
  for (i = tat->size -1; i >= 0; i--)
    tat_mark(tat_child(tat, i));    /* recursively mark the subtrees */
}  /* tat_mark() */
|
|
839
|
+
|
|
840
|
+
/*--------------------------------------------------------------------*/
|
|
841
|
+
#ifndef NDEBUG
|
|
842
|
+
|
|
843
|
+
/* Recursive part of tat_show(): print a subtree to stdout.
 *
 * tat  node to print
 * ind  indentation level (number of indentation strings printed
 *      before every item of this node except the first)
 *
 * Nodes with size <= 0 print their items (with the end-of-transaction
 * mark bit removed) and end the line. For other nodes each item is
 * printed, followed by its subtree. Children are obtained through
 * tat_child(), so that the padding between the item vector and the
 * child pointers used under ARCH64 is respected.
 */
void _show (TATREE *tat, int ind)
{                               /* --- recursive part of tat_show() */
  int i, k;                     /* loop variables */

  assert(tat && (ind >= 0));    /* check the function arguments */
  if (tat->size <= 0) {         /* if this is a leaf node */
    for (i = 0; i < tat->max; i++)
      printf("%d ", tat->items[i] & ~INT_MIN);
    printf("\n"); return;       /* print the items in the */
  }                             /* (rest of) the transaction */
  for (i = 0; i < tat->size; i++) {
    if (i > 0) for (k = ind; --k >= 0; ) printf(" ");
    printf("%d ", tat->items[i]);
    _show(tat_child(tat, i), ind+1);
  }                             /* print each item and its subtree */
}  /* _show() */
|
|
861
|
+
|
|
862
|
+
/*--------------------------------------------------------------------*/
|
|
863
|
+
|
|
864
|
+
/* Print a transaction tree to stdout (debugging aid).
 * The work is done by _show(), started at indentation level 0.
 */
void tat_show (TATREE *tat)
{                               /* --- show a transaction tree */
  const int top = 0;            /* indentation level of the root */

  assert(tat);                  /* check the function argument */
  _show(tat, top);              /* delegate to the recursive part */
}  /* tat_show() */
|
|
869
|
+
|
|
870
|
+
#endif
|