apriori-rails 0.2.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +22 -0
- data/License.txt +20 -0
- data/Manifest.txt +121 -0
- data/README.txt +149 -0
- data/Rakefile +17 -0
- data/TODO.txt +60 -0
- data/attic/c_ext_test1/MyTest/MyTest.c +23 -0
- data/attic/c_ext_test1/MyTest/extconf.rb +11 -0
- data/attic/c_ext_test1/mytest.rb +10 -0
- data/attic/test.c +12 -0
- data/config/hoe.rb +88 -0
- data/config/requirements.rb +29 -0
- data/examples/01_simple_example.rb +39 -0
- data/examples/02_small_file_example.rb +17 -0
- data/examples/03_large_file_example.rb +22 -0
- data/examples/test_data/market_basket_basic_test.dat +9 -0
- data/ext/Apriori.c +149 -0
- data/ext/Makefile +149 -0
- data/ext/apriori/doc/apriori.html +1301 -0
- data/ext/apriori/doc/arem.gp +68 -0
- data/ext/apriori/doc/c_rev.gp +89 -0
- data/ext/apriori/doc/chi2.tex +156 -0
- data/ext/apriori/doc/copying +504 -0
- data/ext/apriori/doc/line.gif +0 -0
- data/ext/apriori/doc/uparrow.gif +0 -0
- data/ext/apriori/ex/flg2set +15 -0
- data/ext/apriori/ex/hdr2set +13 -0
- data/ext/apriori/ex/readme +71 -0
- data/ext/apriori/ex/row2set +7 -0
- data/ext/apriori/ex/rulesort +24 -0
- data/ext/apriori/ex/tab2set +9 -0
- data/ext/apriori/ex/test.app +2 -0
- data/ext/apriori/ex/test.rul +9 -0
- data/ext/apriori/ex/test1.rul +43 -0
- data/ext/apriori/ex/test1.tab +10 -0
- data/ext/apriori/ex/test2.tab +10 -0
- data/ext/apriori/ex/test3.tab +30 -0
- data/ext/apriori/ex/test4.tab +11 -0
- data/ext/apriori/ex/test5.tab +39 -0
- data/ext/apriori/ex/tid2set +23 -0
- data/ext/apriori/ex/xhdr2set +33 -0
- data/ext/apriori/src/apriori.c +750 -0
- data/ext/apriori/src/apriori.dsp +120 -0
- data/ext/apriori/src/apriori.dsw +29 -0
- data/ext/apriori/src/apriori.mak +99 -0
- data/ext/apriori/src/istree.c +1411 -0
- data/ext/apriori/src/istree.h +160 -0
- data/ext/apriori/src/makefile +105 -0
- data/ext/apriori/src/tract.c +870 -0
- data/ext/apriori/src/tract.h +261 -0
- data/ext/apriori_wrapper.c +757 -0
- data/ext/apriori_wrapper.h +10 -0
- data/ext/extconf.rb +32 -0
- data/ext/math/doc/copying +504 -0
- data/ext/math/src/chi2.c +151 -0
- data/ext/math/src/chi2.h +27 -0
- data/ext/math/src/choose.c +71 -0
- data/ext/math/src/choose.h +16 -0
- data/ext/math/src/gamma.c +446 -0
- data/ext/math/src/gamma.h +39 -0
- data/ext/math/src/intexp.c +35 -0
- data/ext/math/src/intexp.h +15 -0
- data/ext/math/src/makefile +164 -0
- data/ext/math/src/math.mak +48 -0
- data/ext/math/src/normal.c +387 -0
- data/ext/math/src/normal.h +44 -0
- data/ext/math/src/radfn.c +113 -0
- data/ext/math/src/radfn.h +34 -0
- data/ext/math/src/zeta.c +49 -0
- data/ext/math/src/zeta.h +15 -0
- data/ext/pre-clean.rb +8 -0
- data/ext/pre-setup.rb +9 -0
- data/ext/util/doc/copying +504 -0
- data/ext/util/src/listops.c +76 -0
- data/ext/util/src/listops.h +26 -0
- data/ext/util/src/makefile +103 -0
- data/ext/util/src/memsys.c +84 -0
- data/ext/util/src/memsys.h +42 -0
- data/ext/util/src/nstats.c +288 -0
- data/ext/util/src/nstats.h +69 -0
- data/ext/util/src/params.c +86 -0
- data/ext/util/src/params.h +19 -0
- data/ext/util/src/parse.c +133 -0
- data/ext/util/src/parse.h +81 -0
- data/ext/util/src/scan.c +767 -0
- data/ext/util/src/scan.h +111 -0
- data/ext/util/src/symtab.c +443 -0
- data/ext/util/src/symtab.h +121 -0
- data/ext/util/src/tabscan.c +279 -0
- data/ext/util/src/tabscan.h +99 -0
- data/ext/util/src/util.mak +91 -0
- data/ext/util/src/vecops.c +317 -0
- data/ext/util/src/vecops.h +42 -0
- data/lib/apriori.rb +133 -0
- data/lib/apriori/adapter.rb +13 -0
- data/lib/apriori/association_rule.rb +97 -0
- data/lib/apriori/version.rb +3 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/script/txt2html +82 -0
- data/setup.rb +1585 -0
- data/tasks/apriori.rake +20 -0
- data/tasks/attic.rake +28 -0
- data/tasks/deployment.rake +34 -0
- data/tasks/environment.rake +7 -0
- data/tasks/install.rake +13 -0
- data/tasks/website.rake +17 -0
- data/test/apriori_test.rb +13 -0
- data/test/fixtures/market_basket_results_test.txt +5 -0
- data/test/fixtures/market_basket_string_test.txt +7 -0
- data/test/fixtures/results.txt +2 -0
- data/test/fixtures/sample.txt +7 -0
- data/test/test_helper.rb +5 -0
- data/test/unit/test_apriori.rb +68 -0
- data/test/unit/test_itemsets_and_parsing.rb +82 -0
- data/website/index.html +251 -0
- data/website/index.txt +154 -0
- data/website/javascripts/rounded_corners_lite.inc.js +285 -0
- data/website/stylesheets/screen.css +142 -0
- data/website/template.html.erb +49 -0
- metadata +267 -0
@@ -0,0 +1,120 @@
|
|
1
|
+
# Microsoft Developer Studio Project File - Name="apriori" - Package Owner=<4>
|
2
|
+
# Microsoft Developer Studio Generated Build File, Format Version 6.00
|
3
|
+
# ** NICHT BEARBEITEN **
|
4
|
+
|
5
|
+
# TARGTYPE "Win32 (x86) Console Application" 0x0103
|
6
|
+
|
7
|
+
CFG=apriori - Win32 Debug
|
8
|
+
!MESSAGE Dies ist kein gültiges Makefile. Zum Erstellen dieses Projekts mit NMAKE
|
9
|
+
!MESSAGE verwenden Sie den Befehl "Makefile exportieren" und führen Sie den Befehl
|
10
|
+
!MESSAGE
|
11
|
+
!MESSAGE NMAKE /f "apriori.mak".
|
12
|
+
!MESSAGE
|
13
|
+
!MESSAGE Sie können beim Ausführen von NMAKE eine Konfiguration angeben
|
14
|
+
!MESSAGE durch Definieren des Makros CFG in der Befehlszeile. Zum Beispiel:
|
15
|
+
!MESSAGE
|
16
|
+
!MESSAGE NMAKE /f "apriori.mak" CFG="apriori - Win32 Debug"
|
17
|
+
!MESSAGE
|
18
|
+
!MESSAGE Für die Konfiguration stehen zur Auswahl:
|
19
|
+
!MESSAGE
|
20
|
+
!MESSAGE "apriori - Win32 Release" (basierend auf "Win32 (x86) Console Application")
|
21
|
+
!MESSAGE "apriori - Win32 Debug" (basierend auf "Win32 (x86) Console Application")
|
22
|
+
!MESSAGE
|
23
|
+
|
24
|
+
# Begin Project
|
25
|
+
# PROP AllowPerConfigDependencies 0
|
26
|
+
# PROP Scc_ProjName ""
|
27
|
+
# PROP Scc_LocalPath ""
|
28
|
+
CPP=cl.exe
|
29
|
+
RSC=rc.exe
|
30
|
+
|
31
|
+
!IF "$(CFG)" == "apriori - Win32 Release"
|
32
|
+
|
33
|
+
# PROP BASE Use_MFC 0
|
34
|
+
# PROP BASE Use_Debug_Libraries 0
|
35
|
+
# PROP BASE Output_Dir "Release"
|
36
|
+
# PROP BASE Intermediate_Dir "Release"
|
37
|
+
# PROP BASE Target_Dir ""
|
38
|
+
# PROP Use_MFC 0
|
39
|
+
# PROP Use_Debug_Libraries 0
|
40
|
+
# PROP Output_Dir "Release"
|
41
|
+
# PROP Intermediate_Dir "Release"
|
42
|
+
# PROP Target_Dir ""
|
43
|
+
# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
|
44
|
+
# ADD CPP /nologo /W3 /GX /O2 /I "..\..\util\src" /I "..\..\math\src" /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /D "NIMAPFN" /YX /FD /c
|
45
|
+
# ADD BASE RSC /l 0x407 /d "NDEBUG"
|
46
|
+
# ADD RSC /l 0x407 /d "NDEBUG"
|
47
|
+
BSC32=bscmake.exe
|
48
|
+
# ADD BASE BSC32 /nologo
|
49
|
+
# ADD BSC32 /nologo
|
50
|
+
LINK32=link.exe
|
51
|
+
# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
|
52
|
+
# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
|
53
|
+
|
54
|
+
!ELSEIF "$(CFG)" == "apriori - Win32 Debug"
|
55
|
+
|
56
|
+
# PROP BASE Use_MFC 0
|
57
|
+
# PROP BASE Use_Debug_Libraries 1
|
58
|
+
# PROP BASE Output_Dir "Debug"
|
59
|
+
# PROP BASE Intermediate_Dir "Debug"
|
60
|
+
# PROP BASE Target_Dir ""
|
61
|
+
# PROP Use_MFC 0
|
62
|
+
# PROP Use_Debug_Libraries 1
|
63
|
+
# PROP Output_Dir "Debug"
|
64
|
+
# PROP Intermediate_Dir "Debug"
|
65
|
+
# PROP Target_Dir ""
|
66
|
+
# ADD BASE CPP /nologo /W3 /Gm /GX /Zi /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
|
67
|
+
# ADD CPP /nologo /W3 /Gm /GX /ZI /Od /I "..\..\util\src" /I "..\..\math\src" /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /D "NIMAPFN" /YX /FD /c
|
68
|
+
# ADD BASE RSC /l 0x407 /d "_DEBUG"
|
69
|
+
# ADD RSC /l 0x407 /d "_DEBUG"
|
70
|
+
BSC32=bscmake.exe
|
71
|
+
# ADD BASE BSC32 /nologo
|
72
|
+
# ADD BSC32 /nologo
|
73
|
+
LINK32=link.exe
|
74
|
+
# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
|
75
|
+
# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
|
76
|
+
|
77
|
+
!ENDIF
|
78
|
+
|
79
|
+
# Begin Target
|
80
|
+
|
81
|
+
# Name "apriori - Win32 Release"
|
82
|
+
# Name "apriori - Win32 Debug"
|
83
|
+
# Begin Source File
|
84
|
+
|
85
|
+
SOURCE=.\apriori.c
|
86
|
+
# End Source File
|
87
|
+
# Begin Source File
|
88
|
+
|
89
|
+
SOURCE=.\tract.c
|
90
|
+
# End Source File
|
91
|
+
# Begin Source File
|
92
|
+
|
93
|
+
SOURCE=.\istree.c
|
94
|
+
# End Source File
|
95
|
+
# Begin Source File
|
96
|
+
|
97
|
+
SOURCE=..\..\util\src\scan.c
|
98
|
+
# End Source File
|
99
|
+
# Begin Source File
|
100
|
+
|
101
|
+
SOURCE=..\..\util\src\symtab.c
|
102
|
+
# End Source File
|
103
|
+
# Begin Source File
|
104
|
+
|
105
|
+
SOURCE=..\..\util\src\tabscan.c
|
106
|
+
# End Source File
|
107
|
+
# Begin Source File
|
108
|
+
|
109
|
+
SOURCE=..\..\util\src\vecops.c
|
110
|
+
# End Source File
|
111
|
+
# Begin Source File
|
112
|
+
|
113
|
+
SOURCE=..\..\math\src\gamma.c
|
114
|
+
# End Source File
|
115
|
+
# Begin Source File
|
116
|
+
|
117
|
+
SOURCE=..\..\math\src\chi2.c
|
118
|
+
# End Source File
|
119
|
+
# End Target
|
120
|
+
# End Project
|
@@ -0,0 +1,29 @@
|
|
1
|
+
Microsoft Developer Studio Workspace File, Format Version 6.00
|
2
|
+
# WARNUNG: DIESE ARBEITSBEREICHSDATEI DARF NICHT BEARBEITET ODER GELÖSCHT WERDEN!
|
3
|
+
|
4
|
+
###############################################################################
|
5
|
+
|
6
|
+
Project: "apriori"=.\apriori.dsp - Package Owner=<4>
|
7
|
+
|
8
|
+
Package=<5>
|
9
|
+
{{{
|
10
|
+
}}}
|
11
|
+
|
12
|
+
Package=<4>
|
13
|
+
{{{
|
14
|
+
}}}
|
15
|
+
|
16
|
+
###############################################################################
|
17
|
+
|
18
|
+
Global:
|
19
|
+
|
20
|
+
Package=<5>
|
21
|
+
{{{
|
22
|
+
}}}
|
23
|
+
|
24
|
+
Package=<3>
|
25
|
+
{{{
|
26
|
+
}}}
|
27
|
+
|
28
|
+
###############################################################################
|
29
|
+
|
@@ -0,0 +1,99 @@
|
|
1
|
+
#-----------------------------------------------------------------------
|
2
|
+
# File : apriori.mak
|
3
|
+
# Contents: build apriori program
|
4
|
+
# Author : Christian Borgelt
|
5
|
+
# History : 26.01.2003 file created
|
6
|
+
# 20.07.2006 adapted to Visual Studio 8
|
7
|
+
#-----------------------------------------------------------------------
|
8
|
+
CC = cl.exe
|
9
|
+
LD = link.exe
|
10
|
+
DEFS = /D WIN32 /D NDEBUG /D _CONSOLE /D _MBCS \
|
11
|
+
/D _CRT_SECURE_NO_DEPRECATE
|
12
|
+
CFLAGS = /nologo /W3 /EHsc /O2 /I $(UTILDIR) /I $(MATHDIR) $(DEFS) /FD /c
|
13
|
+
LDFLAGS = /nologo /subsystem:console /incremental:no /machine:X86
|
14
|
+
|
15
|
+
THISDIR = ..\..\apriori\src
|
16
|
+
UTILDIR = ..\..\util\src
|
17
|
+
MATHDIR = ..\..\math\src
|
18
|
+
# Header files used by the apriori program sources (utility modules,
# math modules, and the local transaction / item set tree headers).
# Note: the math entry must be the header chi2.h, not an object file.
HDRS     = $(UTILDIR)\vecops.h   $(UTILDIR)\symtab.h \
           $(UTILDIR)\tabscan.h  $(UTILDIR)\scan.h \
           $(MATHDIR)\gamma.h    $(MATHDIR)\chi2.h \
           tract.h istree.h
|
22
|
+
OBJS = $(UTILDIR)\vecops.obj $(UTILDIR)\nimap.obj \
|
23
|
+
$(UTILDIR)\tabscan.obj $(UTILDIR)\scan.obj \
|
24
|
+
$(MATHDIR)\gamma.obj $(MATHDIR)\chi2.obj \
|
25
|
+
tract.obj istree.obj apriori.obj
|
26
|
+
|
27
|
+
#-----------------------------------------------------------------------
|
28
|
+
# Build Program
|
29
|
+
#-----------------------------------------------------------------------
|
30
|
+
all: apriori.exe
|
31
|
+
|
32
|
+
apriori.exe: $(OBJS)
|
33
|
+
$(LD) $(LDFLAGS) $(OBJS) $(LIBS) /out:$@
|
34
|
+
|
35
|
+
#-----------------------------------------------------------------------
|
36
|
+
# Item and Transaction Management
|
37
|
+
#-----------------------------------------------------------------------
|
38
|
+
tract.obj: $(UTILDIR)\symtab.h tract.h tract.c apriori.mak
|
39
|
+
$(CC) $(CFLAGS) tract.c /Fo$@
|
40
|
+
|
41
|
+
#-----------------------------------------------------------------------
|
42
|
+
# Frequent Item Set Tree Management
|
43
|
+
#-----------------------------------------------------------------------
|
44
|
+
istree.obj: $(MATHDIR)\gamma.h tract.h istree.h istree.c apriori.mak
|
45
|
+
$(CC) $(CFLAGS) istree.c /Fo$@
|
46
|
+
|
47
|
+
#-----------------------------------------------------------------------
|
48
|
+
# Main Program
|
49
|
+
#-----------------------------------------------------------------------
|
50
|
+
apriori.obj: $(UTILDIR)\symtab.h tract.h istree.h apriori.c apriori.mak
|
51
|
+
$(CC) $(CFLAGS) /D NIMAPFN apriori.c /Fo$@
|
52
|
+
|
53
|
+
#-----------------------------------------------------------------------
|
54
|
+
# External Modules
|
55
|
+
#-----------------------------------------------------------------------
|
56
|
+
$(UTILDIR)\vecops.obj:
|
57
|
+
cd $(UTILDIR)
|
58
|
+
$(MAKE) /f util.mak vecops.obj
|
59
|
+
cd $(THISDIR)
|
60
|
+
$(UTILDIR)\nimap.obj:
|
61
|
+
cd $(UTILDIR)
|
62
|
+
$(MAKE) /f util.mak nimap.obj
|
63
|
+
cd $(THISDIR)
|
64
|
+
$(UTILDIR)\tabscan.obj:
|
65
|
+
cd $(UTILDIR)
|
66
|
+
$(MAKE) /f util.mak tabscan.obj
|
67
|
+
cd $(THISDIR)
|
68
|
+
$(UTILDIR)\scan.obj:
|
69
|
+
cd $(UTILDIR)
|
70
|
+
$(MAKE) /f util.mak scan.obj
|
71
|
+
cd $(THISDIR)
|
72
|
+
$(MATHDIR)\gamma.obj:
|
73
|
+
cd $(MATHDIR)
|
74
|
+
$(MAKE) /f math.mak gamma.obj
|
75
|
+
cd $(THISDIR)
|
76
|
+
$(MATHDIR)\chi2.obj:
|
77
|
+
cd $(MATHDIR)
|
78
|
+
$(MAKE) /f math.mak chi2.obj
|
79
|
+
cd $(THISDIR)
|
80
|
+
|
81
|
+
#-----------------------------------------------------------------------
|
82
|
+
# Install
|
83
|
+
#-----------------------------------------------------------------------
|
84
|
+
install:
|
85
|
+
-@copy apriori.exe c:\home\bin
|
86
|
+
|
87
|
+
#-----------------------------------------------------------------------
|
88
|
+
# Clean up
|
89
|
+
#-----------------------------------------------------------------------
|
90
|
+
clean:
|
91
|
+
$(MAKE) /f apriori.mak localclean
|
92
|
+
cd $(UTILDIR)
|
93
|
+
$(MAKE) /f util.mak clean
|
94
|
+
cd $(MATHDIR)
|
95
|
+
$(MAKE) /f math.mak clean
|
96
|
+
cd $(THISDIR)
|
97
|
+
|
98
|
+
localclean:
|
99
|
+
-@erase /Q *~ *.obj *.idb *.pch apriori.exe
|
@@ -0,0 +1,1411 @@
|
|
1
|
+
/*----------------------------------------------------------------------
|
2
|
+
File : istree.c
|
3
|
+
Contents: item set tree management
|
4
|
+
Author : Christian Borgelt
|
5
|
+
History : 1996.01.22 file created
|
6
|
+
1996.02.07 _child, _count, ist_addlvl, and ist_count
|
7
|
+
1996.02.09 ist_rule programmed and debugged
|
8
|
+
1996.02.10 empty rule bodies made optional
|
9
|
+
1996.03.28 support made relative to number of item sets
|
10
|
+
1996.06.25 function _count optimized
|
11
|
+
1996.11.23 rule extraction redesigned
|
12
|
+
1996.11.24 rule selection criteria added
|
13
|
+
1997.08.18 normalized chi^2 measure added
|
14
|
+
parameter minlen added to function ist_init()
|
15
|
+
1998.01.15 confidence comparison changed to >=
|
16
|
+
1998.01.23 integer support computation changed (ceil)
|
17
|
+
1998.01.26 condition added to set extension in _child
|
18
|
+
1998.02.10 bug in computation of EM_INFO fixed
|
19
|
+
1998.02.11 parameter 'minval' added to function ist_init()
|
20
|
+
1998.05.14 item set tree navigation functions added
|
21
|
+
1998.08.08 item appearances considered for rule selection
|
22
|
+
1998.08.20 deferred child node vector allocation added
|
23
|
+
1998.09.02 several assertions added
|
24
|
+
1998.09.05 bug concerning node id fixed
|
25
|
+
1998.09.07 function ist_hedge added
|
26
|
+
1998.09.22 bug in rule extraction (item appearances) fixed
|
27
|
+
1998.09.23 computation of chi^2 measure simplified
|
28
|
+
1999.02.05 long int changed to int
|
29
|
+
1999.08.25 rule extraction simplified
|
30
|
+
1999.11.05 rule evaluation measure EM_AIMP added
|
31
|
+
1999.11.08 parameter 'aval' added to function ist_rule
|
32
|
+
1999.11.11 rule consequents moved to first field
|
33
|
+
1999.12.01 bug in node reallocation fixed
|
34
|
+
2001.04.01 functions ist_set and ist_getcntx added,
|
35
|
+
functions _count and _getsupp improved
|
36
|
+
2001.12.28 sort function moved to module tract
|
37
|
+
2002.02.07 tree clearing removed, counting improved
|
38
|
+
2002.02.08 child creation improved (check of body support)
|
39
|
+
2002.02.10 IST_IGNORE bugs fixed (ist_set and ist_hedge)
|
40
|
+
2002.02.11 memory usage minimization option added
|
41
|
+
2002.02.12 ist_first and ist_last replaced by ist_next
|
42
|
+
2002.02.19 transaction tree functions added
|
43
|
+
2002.10.09 bug in function ist_hedge fixed (conf. comp.)
|
44
|
+
2003.03.12 parameter lift added to function ist_rule
|
45
|
+
2003.07.17 check of item usage added (function ist_check)
|
46
|
+
2003.07.18 maximally frequent item set filter added
|
47
|
+
2003.08.11 item set filtering generalized (ist_filter)
|
48
|
+
2003.08.15 renamed new to cur in ist_addlvl (C++ compat.)
|
49
|
+
2003.11.14 definition of F_HDONLY changed to INT_MIN
|
50
|
+
2003.12.02 skipping unnecessary subtrees added (_checksub)
|
51
|
+
2003.12.03 bug in ist_check for rule mining fixed
|
52
|
+
2003.12.12 padding for 64 bit architecture added
|
53
|
+
2004.05.09 additional selection measure for sets added
|
54
|
+
2004.12.09 bug in add. evaluation measure for sets fixed
|
55
|
+
2006.11.26 support parameter changed to an absolute value
|
56
|
+
2007.02.07 bug in function ist_addlvl / _child fixed
|
57
|
+
2008.01.25 bug in filtering closed/maximal item sets fixed
|
58
|
+
2008.03.13 additional rule evaluation redesigned
|
59
|
+
2008.03.24 creation based on ITEMSET structure
|
60
|
+
----------------------------------------------------------------------*/
|
61
|
+
#include <stdio.h>
|
62
|
+
#include <stdlib.h>
|
63
|
+
#include <string.h>
|
64
|
+
#include <limits.h>
|
65
|
+
#include <float.h>
|
66
|
+
#include <math.h>
|
67
|
+
#include <assert.h>
|
68
|
+
#include "istree.h"
|
69
|
+
#include "chi2.h"
|
70
|
+
#ifdef STORAGE
|
71
|
+
#include "storage.h"
|
72
|
+
#endif
|
73
|
+
|
74
|
+
/*----------------------------------------------------------------------
|
75
|
+
Preprocessor Definitions
|
76
|
+
----------------------------------------------------------------------*/
|
77
|
+
#define LN_2      0.69314718055994530942  /* natural logarithm of 2 */
#define EPSILON   1e-12         /* tolerance absorbing roundoff errors */
#define BLKSIZE   32            /* block size for the level vector */

/* Both flags are INT_MIN; they are kept apart by where they are */
/* stored: F_HDONLY in a node's id, F_SKIP in counters / chcnt.  */
#define F_HDONLY  INT_MIN       /* flag: head only item in path */
#define F_SKIP    INT_MIN       /* flag: subtree skipping */

/* item identifier of node n with the head-only flag removed */
#define ID(n)     ((int)((n)->id & ~F_HDONLY))
/* nonzero iff the head-only flag is set in the id of node n */
#define HDONLY(n) ((int)((n)->id & F_HDONLY))
/* value n with the skip flag removed */
#define COUNT(n)  ((n) & ~F_SKIP)

/*----------------------------------------------------------------------
  Type Definitions
----------------------------------------------------------------------*/

/* signature of a function computing an additional evaluation measure */
typedef double EVALFN (int set, int body, int head, int n);
|
91
|
+
|
92
|
+
/*----------------------------------------------------------------------
|
93
|
+
Auxiliary Functions
|
94
|
+
----------------------------------------------------------------------*/
|
95
|
+
|
96
|
+
/*----------------------------------------------------------------------
  _bsearch: binary search for an item identifier.
    vec   vector of identifiers; the comparisons below require it to
          be sorted in ascending order
    n     number of elements in vec (must be > 0)
    id    identifier to search for
  returns the index at which id is stored in vec, or -1 if vec does
  not contain id.
----------------------------------------------------------------------*/
static int _bsearch (int *vec, int n, int id)
{                               /* --- binary search for an item */
  int lo, mid;                  /* left and middle index */

  assert(vec && (n > 0));       /* check the function arguments */
  for (lo = 0; lo < n; ) {      /* while the range [lo,n) is not empty */
    /* compute the middle index without forming lo+n, which could   */
    /* overflow a signed int for very large vectors (same result as */
    /* (lo+n) >> 1 whenever that sum is representable)              */
    mid = lo + ((n - lo) >> 1);
    if      (vec[mid] > id) n  = mid;     /* continue in left  half */
    else if (vec[mid] < id) lo = mid +1;  /* continue in right half */
    else return mid;            /* found: return the element index */
  }
  return -1;                    /* return 'not found' */
}  /* _bsearch() */
|
109
|
+
|
110
|
+
/*--------------------------------------------------------------------*/
|
111
|
+
|
112
|
+
/*----------------------------------------------------------------------
  _count: count a transaction in the subtree rooted at a node.
    node  current item set tree node
    set   remaining items of the transaction (the early exits below
          rely on ascending item order -- presumably guaranteed by
          the caller)
    cnt   number of items in set
    min   number of items still needed to reach the deepest nodes;
          it is decremented for each level descended, and items are
          only followed while at least that many remain
  Node layout as used here: cnts holds node->size counters.  With
  offset >= 0 the counters are indexed by (item - offset) and the
  child vector follows directly.  With offset < 0 an identifier map
  of node->size items follows the counters, then the child vector,
  then (if chcnt < size) a secondary identifier map for the children.
----------------------------------------------------------------------*/
static void _count (ISNODE *node, int *set, int cnt, int min)
{                               /* --- count transaction recursively */
  int    idx;                   /* counter or child vector index */
  int    base;                  /* item identifier of index 0 */
  int    len;                   /* length of the identifier map */
  int    *ids;                  /* identifier map */
  ISNODE **chn;                 /* child node vector */

  assert(node                   /* check the function arguments */
     && (cnt >= 0) && (set || (cnt <= 0)));
  if (node->chcnt < 0)          /* a negative child counter is neither */
    return;                     /* a new node nor one with children */

  if (node->offset >= 0) {      /* --- pure vector */
    if (node->chcnt == 0) {     /* new node: update its counters */
      base = node->offset;
      for ( ; (cnt > 0) && (*set < base); cnt--)
        set++;                  /* skip items before first counter */
      for (cnt--; cnt >= 0; cnt--) {
        idx = *set++ -base;     /* compute the counter index */
        if (idx >= node->size) return;  /* beyond the last counter */
        node->cnts[idx]++;      /* count the transaction */
      }
      return;
    }
    chn  = (ISNODE**)(node->cnts +node->size);
    base = ID(chn[0]);          /* children are indexed from first id */
    min--;                      /* one item less to the deepest nodes */
    for ( ; (cnt > min) && (*set < base); cnt--)
      set++;                    /* skip items before first child */
    for (cnt--; cnt >= min; cnt--) {
      idx = *set++ -base;       /* compute the child vector index */
      if (idx >= node->chcnt) return;   /* beyond the last child */
      if (chn[idx])             /* if the child exists, descend with */
        _count(chn[idx], set, cnt, min);  /* the remaining items */
    }
    return;
  }

  len = node->size;             /* --- identifier map */
  ids = node->cnts +len;        /* the map follows the counters */
  if (node->chcnt == 0) {       /* new node: update its counters */
    for (cnt--; cnt >= 0; cnt--) {
      if (*set > ids[len-1]) return;    /* beyond last item: abort */
      idx = _bsearch(ids, len, *set++);
      if (idx >= 0) node->cnts[idx]++;  /* count if item is mapped */
    }
    return;
  }
  chn = (ISNODE**)(ids +len);   /* child vector follows the map */
  if (node->chcnt < len) {      /* if a secondary id. map exists, */
    len = node->chcnt;          /* it follows the child vector and */
    ids = (int*)(chn +len);     /* has one entry per child */
  }
  min--;                        /* one item less to the deepest nodes */
  for (cnt--; cnt >= min; cnt--) {
    if (*set > ids[len-1]) return;      /* beyond last item: abort */
    idx = _bsearch(ids, len, *set++);
    if ((idx >= 0) && chn[idx]) /* if the child exists, descend with */
      _count(chn[idx], set, cnt, min);  /* the remaining items */
  }
}  /* _count() */
|
163
|
+
|
164
|
+
/*--------------------------------------------------------------------*/
|
165
|
+
|
166
|
+
/*----------------------------------------------------------------------
  _countx: count a transaction tree in the subtree rooted at a node.
    node  current item set tree node
    tat   current transaction tree node
    min   number of items still needed to reach the deepest nodes
  tat_size(tat) is interpreted as follows: a negative value means tat
  is a plain transaction with -size items, which is handed to
  _count(); zero means there is nothing to count; a positive value is
  the number of children of tat.
----------------------------------------------------------------------*/
static void _countx (ISNODE *node, TATREE *tat, int min)
{                               /* --- count transa. tree recursively */
  int    idx, k;                /* vector index, loop variable */
  int    base;                  /* item identifier of index 0 */
  int    len;                   /* length of the identifier map */
  int    *ids;                  /* identifier map */
  ISNODE **chn;                 /* child node vector */

  assert(node && tat);          /* check the function arguments */
  if (tat_max(tat) < min)       /* if the transactions are too short, */
    return;                     /* abort the recursion */
  k = tat_size(tat);            /* get the number of children */
  if (k < 0) {                  /* plain transaction: count it */
    _count(node, tat_items(tat), -k, min);
    return;
  }
  if (k == 0) return;           /* no children: nothing to count */

  for (k--; k >= 0; k--)        /* first count all transaction */
    _countx(node, tat_child(tat, k), min);  /* subtrees in this node */
  if (node->chcnt < 0)          /* a negative child counter is neither */
    return;                     /* a new node nor one with children */

  if (node->offset >= 0) {      /* --- pure vector */
    if (node->chcnt == 0) {     /* new node: update its counters */
      base = node->offset;
      for (k = tat_size(tat) -1; k >= 0; k--) {
        idx = tat_item(tat, k) -base;
        if (idx < 0) return;    /* before first item: abort */
        if (idx < node->size)   /* inside the counter range: add the */
          node->cnts[idx] += tat_cnt(tat_child(tat, k));
      }                         /* counter of the transaction subtree */
      return;
    }
    chn  = (ISNODE**)(node->cnts +node->size);
    base = ID(chn[0]);          /* children are indexed from first id */
    min--;                      /* one item less to the deepest nodes */
    for (k = tat_size(tat) -1; k >= 0; k--) {
      idx = tat_item(tat, k) -base;
      if (idx < 0) return;      /* before first item: abort */
      if ((idx < node->chcnt) && chn[idx])  /* if the child exists, */
        _countx(chn[idx], tat_child(tat, k), min);  /* descend */
    }
    return;
  }

  len = node->size;             /* --- identifier map */
  ids = node->cnts +len;        /* the map follows the counters */
  if (node->chcnt == 0) {       /* new node: update its counters */
    for (k = tat_size(tat) -1; k >= 0; k--) {
      idx = tat_item(tat, k);   /* get the next item */
      if (idx < ids[0]) return; /* before first item: abort */
      idx = _bsearch(ids, len, idx);
      if (idx >= 0)             /* if the item is mapped, count */
        node->cnts[idx] += tat_cnt(tat_child(tat, k));
    }
    return;
  }
  chn = (ISNODE**)(ids +len);   /* child vector follows the map */
  if (node->chcnt < len) {      /* if a secondary id. map exists, */
    len = node->chcnt;          /* it follows the child vector and */
    ids = (int*)(chn +len);     /* has one entry per child */
  }
  min--;                        /* one item less to the deepest nodes */
  for (k = tat_size(tat) -1; k >= 0; k--) {
    idx = tat_item(tat, k);     /* get the next item */
    if (idx < ids[0]) return;   /* before first item: abort */
    idx = _bsearch(ids, len, idx);
    if ((idx >= 0) && chn[idx]) /* if the child exists, descend */
      _countx(chn[idx], tat_child(tat, k), min);
  }
}  /* _countx() */
|
225
|
+
|
226
|
+
/*--------------------------------------------------------------------*/
|
227
|
+
|
228
|
+
/*----------------------------------------------------------------------
  _checksub: recursively determine which subtrees can be skipped.
    node  root of the subtree to check
  returns 0 if the subtree must not be skipped (it contains a node
  with a child counter of zero, i.e. a new leaf), and -1 if it can be
  skipped.  In the latter case the skip flag is set in node->chcnt.
  All children are always visited, so flags are also set in subtrees
  of a node that itself cannot be skipped.
----------------------------------------------------------------------*/
static int _checksub (ISNODE *node)
{                               /* --- recursively check subtrees */
  int    k;                     /* child vector index */
  int    skip = -1;             /* result: -1 = skip, 0 = keep */
  ISNODE **chn;                 /* child node vector */

  assert(node);                 /* check the function argument */
  if (node->chcnt == 0) return  0;  /* do not skip new leaves */
  if (node->chcnt <  0) return -1;  /* skip marked subtrees */

  /* the child vector follows the counters and, if an identifier */
  /* map is used (offset < 0), also the map of the same size */
  chn = (ISNODE**)(node->cnts +((node->offset >= 0)
                                ? node->size : node->size +node->size));
  for (k = node->chcnt -1; k >= 0; k--) {
    if (!chn[k]) continue;      /* ignore missing children */
    if (_checksub(chn[k]) == 0) /* one child that must be kept */
      skip = 0;                 /* keeps this node as well */
  }
  if (!skip) return 0;          /* some subtree cannot be skipped */
  node->chcnt |= F_SKIP;        /* otherwise set the skip flag */
  return -1;                    /* return 'subtree can be skipped' */
}  /* _checksub() */
|
246
|
+
|
247
|
+
/*--------------------------------------------------------------------*/
|
248
|
+
|
249
|
+
/*----------------------------------------------------------------------
  _checkuse: recursively check which items are used.
    node   root of the subtree to check
    marks  marker vector indexed by item identifier; entries of used
           items are set to 1 (other entries are left untouched)
    supp   minimum support a counter must reach
  In nodes with a child counter of zero every item whose counter is
  at least supp is marked.  If anything was marked in the subtree and
  the node has a parent, the item associated with the node is marked
  as well.
  returns 1 if at least one item was marked in the subtree, else 0.
----------------------------------------------------------------------*/
static int _checkuse (ISNODE *node, char *marks, int supp)
{                               /* --- recursively check item usage */
  int    k;                     /* counter or child vector index */
  int    item;                  /* identifier of the item to mark */
  int    used = 0;              /* result of the check */
  int    mapped;                /* whether an identifier map is used */
  ISNODE **chn;                 /* child node vector */

  assert(node && marks);        /* check the function arguments */
  mapped = (node->offset < 0);  /* otherwise a pure vector is used */

  if (node->chcnt == 0) {       /* if this is a new node */
    for (k = node->size -1; k >= 0; k--) {
      if (node->cnts[k] < supp) /* skip item sets that do not satisfy */
        continue;               /* the minimum support criterion */
      item = (mapped) ? node->cnts[node->size +k]  /* id. map entry */
                      : node->offset +k;           /* or offset+index */
      marks[item] = 1;          /* mark the item as used */
      used = 1;                 /* and note the success */
    } }
  else if (node->chcnt > 0) {   /* if there are child nodes */
    /* the child vector follows the counters and, if present, */
    /* the identifier map (which has the same size) */
    chn = (ISNODE**)(node->cnts +((mapped)
                                  ? node->size +node->size : node->size));
    for (k = node->chcnt -1; k >= 0; k--)
      if (chn[k] && _checkuse(chn[k], marks, supp))
        used = 1;               /* recursively process all children */
  }
  if (used && node->parent)     /* if the check succeeded, mark */
    marks[ID(node)] = 1;        /* the item associated with the node */
  return used;                  /* return the check result */
}  /* _checkuse() */
|
285
|
+
|
286
|
+
/*--------------------------------------------------------------------*/
|
287
|
+
|
288
|
+
/*----------------------------------------------------------------------
  _getsupp: get the support of an item set.
    node  node to start the search from
    set   items of the set, used as a path: all but the last item
          select child nodes, the last one selects a counter
    cnt   number of items in set
  returns the counter stored for the item set (node->cnts[i], as
  stored), or -1 if the path or the counter does not exist.
----------------------------------------------------------------------*/
static int _getsupp (ISNODE *node, int *set, int cnt)
{                               /* --- get support of an item set */
  int    idx;                   /* child or counter vector index */
  int    len;                   /* length of the identifier map */
  int    nch;                   /* number of child nodes */
  int    *ids;                  /* identifier map */
  ISNODE **chn;                 /* vector of child nodes */

  assert(node && set && (cnt >= 0)); /* check the function arguments */
  for (cnt--; cnt > 0; cnt--) { /* follow the set/path from the node */
    nch = node->chcnt & ~F_SKIP;     /* child count without skip flag */
    if (nch <= 0) return -1;    /* no children: set is not in tree */
    if (node->offset >= 0) {    /* if a pure vector is used */
      chn = (ISNODE**)(node->cnts +node->size);
      idx = *set++ -ID(chn[0]); /* index relative to the first child */
      if (idx >= nch) return -1;
    }
    else {                      /* if an identifier map is used */
      len = node->size;         /* the map follows the counters, */
      ids = node->cnts +len;    /* the child vector follows the map */
      chn = (ISNODE**)(ids +len);
      if (nch < len) {          /* if a secondary id. map exists, */
        len = nch;              /* use it (it follows the children) */
        ids = (int*)(chn +len);
      }
      idx = _bsearch(ids, len, *set++);
    }
    if (idx < 0) return -1;     /* abort if index is out of range */
    node = chn[idx];            /* go to the corresponding child */
    if (!node) return -1;       /* abort if the child does not exist */
  }

  if (node->offset >= 0) {      /* if a pure vector is used, */
    idx = *set -node->offset;   /* compute the counter index */
    if (idx >= node->size) return -1;
  }
  else                          /* if an identifier map is used, */
    idx = _bsearch(node->cnts +node->size, node->size, *set);
  return (idx < 0) ? -1         /* abort if index is out of range, */
       : node->cnts[idx];       /* else return the item set support */
}  /* _getsupp() */
|
323
|
+
|
324
|
+
/*--------------------------------------------------------------------*/
|
325
|
+
|
326
|
+
/*----------------------------------------------------------------------
  _marksupp: mark the support counter of an item set.
    node  node to start the search from
    set   items of the set, used as a path: all but the last item
          select child nodes, the last one selects a counter
    cnt   number of items in set
    supp  support to compare with; if negative, the counter is
          marked unconditionally
  The counter is marked by setting F_SKIP in it, either always
  (supp < 0) or only if the stored value equals supp.
  NOTE(review): unlike _getsupp(), no index or child pointer is
  validated here, so the item set is assumed to be present in the
  tree -- confirm that all callers guarantee this.
----------------------------------------------------------------------*/
static void _marksupp (ISNODE *node, int *set, int cnt, int supp)
{                               /* --- mark support of an item set */
  int    idx;                   /* child or counter vector index */
  int    len;                   /* length of the identifier map */
  int    nch;                   /* number of child nodes */
  int    *ids;                  /* identifier map */
  ISNODE **chn;                 /* vector of child nodes */

  assert(node && set && (cnt >= 0)); /* check the function arguments */
  for (cnt--; cnt > 0; cnt--) { /* follow the set/path from the node */
    if (node->offset >= 0) {    /* if a pure vector is used */
      chn = (ISNODE**)(node->cnts +node->size);
      idx = *set++ -ID(chn[0]); /* index relative to the first child */
    }
    else {                      /* if an identifier map is used */
      len = node->size;         /* the map follows the counters, */
      ids = node->cnts +len;    /* the child vector follows the map */
      chn = (ISNODE**)(ids +len);
      nch = node->chcnt & ~F_SKIP;   /* child count without skip flag */
      if (nch < len) {          /* if a secondary id. map exists, */
        len = nch;              /* use it (it follows the children) */
        ids = (int*)(chn +len);
      }
      idx = _bsearch(ids, len, *set++);
    }
    node = chn[idx];            /* go to the corresponding child */
  }

  if (node->offset >= 0)        /* if a pure vector is used, */
    idx = *set -node->offset;   /* compute the counter index */
  else                          /* if an identifier map is used, */
    idx = _bsearch(node->cnts +node->size, node->size, *set);
  if ((supp < 0)                /* if to clear unconditionally */
  ||  (node->cnts[idx] == supp))     /* or the support is the same, */
    node->cnts[idx] |= F_SKIP;  /* mark the support as cleared */
}  /* _marksupp() */
|
357
|
+
|
358
|
+
/*--------------------------------------------------------------------*/
|
359
|
+
|
360
|
+
/*
 * Mark the counters of all subsets that have one item less than the set
 * represented by counter `index` of `node`.
 *
 * The item path is assembled backwards at the end of ist->buf.  For every
 * ancestor the partial path is handed to _marksupp(), which flags the
 * matching counter according to `supp` (see there).
 *
 * `node` must have a parent; the root node is not a valid argument.
 */
static void _marksub (ISTREE *ist, ISNODE *node, int index, int supp)
{
  int    *path = ist->buf + ist->vsz;  /* filled from the buffer end */
  int    len;                          /* number of items in the path */
  ISNODE *anc;                         /* ancestor being processed */

  /* last item of the set: the one selected by `index` */
  *--path = (node->offset >= 0) ? node->offset + index
                                : node->cnts[node->size + index];
  _marksupp(node->parent, path, 1, supp);

  /* item of the node itself */
  *--path = ID(node);
  _marksupp(node->parent, path, 1, supp);

  /* climb towards the root, extending the path by one item per level */
  len = 2;
  anc = node->parent;
  while (anc->parent) {
    _marksupp(anc->parent, path, len, supp);
    *--path = ID(anc);
    len++;
    anc = anc->parent;
  }
}  /* _marksub() */
|
376
|
+
|
377
|
+
/*--------------------------------------------------------------------*/
|
378
|
+
|
379
|
+
/*
 * Create the child node that extends the item set selected by counter
 * `index` of `node`.
 *
 * Candidate items are the items of the counters that follow `index` in
 * the same node.  A candidate is kept if its own support and the support
 * of every other same-size subset (looked up with _getsupp() along the
 * path to the root) reach `s_min`, and if at least one of the supports
 * involved reaches `s_body`.  Items flagged IST_IGNORE are skipped, and
 * no set may contain two IST_HEAD (head only) items.
 *
 * Returns the new node, NULL if no child is needed, or (void*)-1 if the
 * memory allocation failed.  Uses ist->buf and ist->map as scratch space.
 */
static ISNODE* _child (ISTREE *ist, ISNODE *node, int index,
                       int s_min, int s_body)
{
  int    pos;                   /* counter index / loop variable */
  int    ncand;                 /* number of accepted candidates */
  int    size;                  /* number of counters of the child */
  int    ints;                  /* number of ints in the child vector */
  int    item, cand;            /* extended item, candidate item */
  int    len;                   /* number of items in the test path */
  int    *path;                 /* (partial) item set to check */
  int    flags;                 /* rule body support flags */
  int    hdonly;                /* whether head only item on path */
  int    app;                   /* appearance flags of an item */
  int    supp;                  /* support of an item set */
  ISNODE *anc;                  /* to traverse the path to the root */
  ISNODE *child;                /* created child node */

  assert(ist && node && (index >= 0) && (index < node->size));

  item = (node->offset >= 0) ? node->offset + index
                             : node->cnts[node->size + index];
  app = is_getapp(ist->set, item);
  if (app == IST_IGNORE)        /* do not extend an item to ignore */
    return NULL;
  if (HDONLY(node) && (app == IST_HEAD))
    return NULL;                /* nor a set with two head only items */
  hdonly = HDONLY(node) || (app == IST_HEAD);

  /* --- initialize --- */
  supp = node->cnts[index];     /* support of the set to extend */
  if (supp < s_min)
    return NULL;
  flags = (supp >= s_body) ? 1 : 0;
  ist->buf[ist->vsz - 2] = item;        /* first item of the test path */

  /* --- check candidates --- */
  ncand = 0;
  for (pos = index + 1; pos < node->size; pos++) {
    cand = (node->offset >= 0) ? node->offset + pos
                               : node->cnts[node->size + pos];
    app = is_getapp(ist->set, cand);
    if ((app == IST_IGNORE) || (hdonly && (app == IST_HEAD)))
      continue;                 /* skip sets with two head only items */
    supp = node->cnts[pos];
    if (supp < s_min)
      continue;                 /* candidate itself is infrequent */
    flags &= 1;                 /* keep only the flag of the set S */
    if (supp >= s_body)
      flags |= 2;
    len  = 2;
    path = ist->buf + ist->vsz - len;
    path[1] = cand;             /* path is now [item, cand] */
    anc = node;
    while (anc->parent) {       /* test the remaining subsets */
      supp = _getsupp(anc->parent, path, len);
      if (supp < s_min)
        break;                  /* some subset is infrequent */
      if (supp >= s_body)
        flags |= 4;
      *--path = ID(anc);        /* prepend the ancestor's item */
      len++;
      anc = anc->parent;
    }
    if (!anc->parent && flags)  /* root reached and body support ok */
      ist->map[ncand++] = cand;
  }
  if (ncand <= 0)
    return NULL;                /* no child is needed */
  #ifdef BENCH                  /* if benchmark version: */
  ist->scnec += ncand;          /* sum the necessary counters */
  #endif

  /* --- decide on node structure --- */
  size = ncand;
  ints = ist->map[ncand - 1] - ist->map[0] + 1;   /* item id range */
  if (!(ist->mode & IST_MEMOPT) || (3*size >= 2*ints))
    size = ints;                /* pure vector over the whole range */
  else
    ints = size + size;         /* counters plus identifier map */
  #ifdef ARCH64                 /* if 64 bit architecture */
  if ((size == ints) && (ints & 1)) size = ++ints;
  #endif                        /* pad to even number of counters */
  #ifdef BENCH                  /* if benchmark version */
  ist->sccnt += size;           /* sum the number of counters */
  ist->bytes += sizeof(ISNODE) +(ints-1) *sizeof(int) +8;
  #endif                        /* determine the memory usage */

  /* --- create child --- */
  child = (ISNODE*)malloc(sizeof(ISNODE) +(ints-1) *sizeof(int));
  if (!child)
    return (void*)-1;
  child->parent = node;
  child->succ   = NULL;
  child->id     = item;
  if (hdonly)
    child->id |= F_HDONLY;
  child->chcnt  = 0;            /* there are no children yet */
  child->size   = size;
  if (size == ints)             /* pure vector: first item is offset */
    child->offset = ist->map[0];
  else {                        /* identifier map behind the counters */
    child->offset = -1;
    for (pos = 0; pos < size; pos++)
      child->cnts[size + pos] = ist->map[pos];
  }
  for (pos = 0; pos < size; pos++)
    child->cnts[pos] = 0;       /* clear all counters of the node */
  return child;
}  /* _child() */
|
473
|
+
|
474
|
+
/*----------------------------------------------------------------------
|
475
|
+
In the above function the set S represented by the index-th vector
|
476
|
+
element of the current node is extended only by combining it with the
|
477
|
+
sets represented by the fields that follow it in the node vector,
|
478
|
+
i.e. by the sets represented by vec[index+1] to vec[size-1]. The sets
|
479
|
+
that can be formed by combining the set S and the sets represented by
|
480
|
+
vec[0] to vec[index-1] are processed in the branches for these sets.
|
481
|
+
In the 'check candidates' loop it is checked for each set represented
|
482
|
+
by vec[index+1] to vec[size-1] whether this set and all other subsets
|
483
|
+
of the same size, which can be formed from the union of this set and
|
484
|
+
the set S, have enough support, so that a child node is necessary.
|
485
|
+
Note that i +offset is the identifier of the item that has to be
|
486
|
+
added to set S to form the union of the set S and the set T represented
|
487
|
+
by vec[i], since S and T have the same path with the exception of the
|
488
|
+
index in the current node. Hence we can speak of candidate items that
|
489
|
+
are added to S.
|
490
|
+
Checking the support of the other subsets of the union of S and T
|
491
|
+
that have the same size as S and T is done with the aid of a path
|
492
|
+
variable. The items in this variable combined with the items on the
|
493
|
+
path to the current node always represent the subset currently tested.
|
494
|
+
That is, the path variable holds the path to be followed from the
|
495
|
+
current node to arrive at the support counter for the subset. The path
|
496
|
+
variable is initialized to [0]: <item>, [1]: <offset+i>, since the
|
497
|
+
support counters for S and T can be inspected directly. Then this
|
498
|
+
path is followed from the parent node of the current node, which is
|
499
|
+
equivalent to checking the subset that can be obtained by removing
|
500
|
+
from the union of S and T the item that corresponds to the parent node
|
501
|
+
(in the path to S or T, resp.).
|
502
|
+
Iteratively making the parent node the current node, adding its
|
503
|
+
corresponding item to the path and checking the support counter at the
|
504
|
+
end of the path variable when starting from its (the new current node's)
|
505
|
+
parent node tests all other subsets.
|
506
|
+
Another criterion is that the extended set must not contain two items
|
507
|
+
which may appear only in the head of a rule. If two such items are
|
508
|
+
contained in a set, neither can a rule be formed from its items nor can
|
509
|
+
it be the antecedent of a rule. Whether a set contains two head only
|
510
|
+
items is determined from the node's 'hdonly' flag and the appearance
|
511
|
+
flags of the items.
|
512
|
+
----------------------------------------------------------------------*/
|
513
|
+
|
514
|
+
/*
 * Clean up after a failed attempt to add a tree level: free every node
 * already linked into the level at index ist->height, reset that level's
 * list head, and zero the child counters of all nodes on the level above.
 */
static void _cleanup (ISTREE *ist)
{
  ISNODE *node;                 /* to traverse the nodes */
  ISNODE *next;                 /* successor of the node being freed */

  assert(ist);

  node = ist->lvls[ist->height];
  while (node) {                /* delete all created nodes */
    next = node->succ;
    free(node);
    node = next;
  }
  ist->lvls[ist->height] = NULL;

  node = ist->lvls[ist->height - 1];
  while (node) {                /* clear the child node counters */
    node->chcnt = 0;            /* of the deepest remaining nodes */
    node = node->succ;
  }
}  /* _cleanup() */
|
525
|
+
|
526
|
+
/*----------------------------------------------------------------------
|
527
|
+
Additional Rule Evaluation Measure Functions
|
528
|
+
----------------------------------------------------------------------*/
|
529
|
+
|
530
|
+
/* --- no additional evaluation measure: always yields 1 */
static double _none (int set, int body, int head, int n)
{
  (void)set;                    /* all arguments are ignored; they */
  (void)body;                   /* exist only to match the other */
  (void)head;                   /* evaluation functions' signature */
  (void)n;
  return 1;
}  /* _none() */
|
532
|
+
|
533
|
+
/*--------------------------------------------------------------------*/
|
534
|
+
|
535
|
+
/*
 * --- absolute confidence difference
 * Returns |head/n - set/body|.  No argument is checked; a zero `body`
 * or `n` leads to a floating point division by zero.
 */
static double _diff (int set, int body, int head, int n)
{
  double prior = head / (double)n;      /* head/n */
  double conf  = set  / (double)body;   /* set/body */

  return fabs(prior - conf);
}  /* _diff() */
|
539
|
+
|
540
|
+
/*--------------------------------------------------------------------*/
|
541
|
+
|
542
|
+
/*
 * --- difference of confidence quotient to 1
 * With t = (set/body) / (head/n), returns 1 - t if t <= 1 and 1 - 1/t
 * otherwise.  Returns 0 if `head` or `body` is not positive.
 */
static double _quot (int set, int body, int head, int n)
{
  double ratio;                 /* quotient of the two fractions */

  if ((head <= 0) || (body <= 0))
    return 0;
  ratio = (set / (double)body) / (head / (double)n);
  if (ratio > 1)                /* use the reciprocal so that the */
    ratio = 1 / ratio;          /* quotient never exceeds 1 */
  return 1 - ratio;
}  /* _quot() */
|
550
|
+
|
551
|
+
/*--------------------------------------------------------------------*/
|
552
|
+
|
553
|
+
/*
 * --- absolute difference of improvement to 1
 * Returns |(set/body) / (head/n) - 1|, or 0 if `head` or `body` is not
 * positive.
 */
static double _aimp (int set, int body, int head, int n)
{
  double conf;                  /* set/body */
  double prior;                 /* head/n */

  if ((head <= 0) || (body <= 0))
    return 0;
  conf  = set  / (double)body;
  prior = head / (double)n;
  return fabs(conf / prior - 1);
}  /* _aimp() */
|
558
|
+
|
559
|
+
/*--------------------------------------------------------------------*/
|
560
|
+
|
561
|
+
static double _info (int set, int body, int head, int n)
|
562
|
+
{ /* --- information diff. to prior */
|
563
|
+
double sum, t; /* result, temporary buffer */
|
564
|
+
|
565
|
+
if ((head <= 0) || (head >= n)
|
566
|
+
|| (body <= 0) || (body >= n))
|
567
|
+
return 0; /* check for strict positivity */
|
568
|
+
sum = 0; /* support of head and body */
|
569
|
+
if (set > 0) sum += set *log(set /( head *(double) body));
|
570
|
+
t = body -set; /* support of not head and body */
|
571
|
+
if (t > 0) sum += t *log(t /((n-head) *(double) body));
|
572
|
+
t = head -set; /* support of head and not body */
|
573
|
+
if (t > 0) sum += t *log(t /( head *(double)(n-body)));
|
574
|
+
t = n -head -body +set; /* support of not head and not body */
|
575
|
+
if (t > 0) sum += t *log(t /((n-head) *(double)(n-body)));
|
576
|
+
return (log(n) +sum/n) /LN_2; /* return information gain in bits */
|
577
|
+
} /* _info() */
|
578
|
+
|
579
|
+
/*--------------------------------------------------------------------*/
|
580
|
+
|
581
|
+
/*
 * --- normalized chi^2 measure
 * Returns (head*body - set*n)^2 / (head * (n-head) * body * (n-body)).
 * Returns 0 unless 0 < head < n and 0 < body < n.
 */
static double _chi2 (int set, int body, int head, int n)
{
  double diff;                  /* head*body - set*n */
  double denom;                 /* product of the four marginals */

  if ((head <= 0) || (head >= n) || (body <= 0) || (body >= n))
    return 0;                   /* check for strict positivity */

  diff = head * (double)body - set * (double)n;

  denom  = (double)head;        /* multiply in the same order as a */
  denom *= (n - head);          /* single left-to-right product */
  denom *= body;
  denom *= (n - body);

  return (diff * diff) / denom;
}  /* _chi2() */
|
591
|
+
|
592
|
+
/*--------------------------------------------------------------------*/
|
593
|
+
|
594
|
+
/*
 * --- p-value from chi^2 measure
 * Scales the normalized chi^2 measure by n and passes it to chi2cdf()
 * with a second argument of 1 (presumably the degrees of freedom --
 * chi2cdf() is declared elsewhere).
 */
static double _pval (int set, int body, int head, int n)
{
  double chi2 = n * _chi2(set, body, head, n);

  return chi2cdf(chi2, 1);
}  /* _pval() */
|
598
|
+
|
599
|
+
/*--------------------------------------------------------------------*/
|
600
|
+
|
601
|
+
static EVALFN *_evalfns[EM_UNKNOWN] = {
|
602
|
+
/* EM_NONE 0 */ _none, /* no additional evaluation measure */
|
603
|
+
/* EM_DIFF 1 */ _diff, /* absolute confidence difference */
|
604
|
+
/* EM_QUOT 2 */ _quot, /* difference of conf. quotient to 1 */
|
605
|
+
/* EM_AIMP 3 */ _aimp, /* abs. diff. of improvement to 1 */
|
606
|
+
/* EM_INFO 4 */ _info, /* information difference to prior */
|
607
|
+
/* EM_CHI2 5 */ _chi2, /* normalized chi^2 measure */
|
608
|
+
/* EM_PVAL 6 */ _pval, /* p-value of chi^2 measure */
|
609
|
+
}; /* table of evaluation functions */
|
610
|
+
|
611
|
+
/*----------------------------------------------------------------------
|
612
|
+
Main Functions
|
613
|
+
----------------------------------------------------------------------*/
|
614
|
+
|
615
|
+
/*
 * Create an item set tree for the items of `set`.
 *
 * Allocates the tree body, the level vector and path buffer (BLKSIZE
 * entries each), an identifier map with one entry per item, and a root
 * node with one counter per item (padded to an even number if ARCH64 is
 * defined).  The root counters are filled with the item frequencies
 * obtained from is_getfrq().
 *
 * `supp` is stored as the rule support (ist->rule).  If `mode` contains
 * IST_HEAD, the item set support (ist->supp) is ceil(conf * supp)
 * instead of `supp`.  Both values are raised to at least 1.
 *
 * Returns the new tree, or NULL if any allocation fails; in that case
 * everything allocated so far is released.
 */
ISTREE* ist_create (ITEMSET *set, int mode, int supp, double conf)
{
  int    nitems;                /* number of items */
  int    size;                  /* number of root counters */
  int    k;                     /* loop variable */
  ISTREE *tree;                 /* created item set tree */
  ISNODE *root;                 /* root node of the tree */

  assert(set && (supp >= 0) && (conf >= 0) && (conf <= 1));

  /* --- allocate memory --- */
  nitems = is_cnt(set);
  tree = (ISTREE*)malloc(sizeof(ISTREE));
  if (!tree)
    return NULL;
  tree->lvls = (ISNODE**)malloc(BLKSIZE *sizeof(ISNODE*));
  if (!tree->lvls)
    goto fail_tree;
  tree->buf = (int*)malloc(BLKSIZE *sizeof(int));
  if (!tree->buf)
    goto fail_lvls;
  tree->map = (int*)malloc(nitems *sizeof(int));
  if (!tree->map)
    goto fail_buf;
  #ifdef ARCH64                 /* if 64 bit architecture, */
  size = nitems +(nitems & 1);  /* pad counters to even number */
  #else                         /* on 32 bit systems, however, */
  size = nitems;                /* use the number of items directly */
  #endif
  root = (ISNODE*)calloc(1, sizeof(ISNODE) +(size-1) *sizeof(int));
  tree->lvls[0] = tree->curr = root;
  if (!root)
    goto fail_map;

  /* --- initialize structures --- */
  tree->set    = set;
  tree->mode   = mode;
  tree->tacnt  = is_gettac(set);
  tree->vsz    = BLKSIZE;
  tree->height = 1;
  tree->rule   = (supp > 0) ? supp : 1;
  if (mode & IST_HEAD)
    supp = (int)ceil(conf *supp);
  tree->supp   = (supp > 0) ? supp : 1;
  tree->conf   = conf;
  #ifdef BENCH                  /* if benchmark version */
  tree->sccnt = tree->scnec = nitems;
  tree->cpcnt = tree->cpnec = 0;
  tree->bytes = sizeof(ISTREE) +nitems *sizeof(char) +8
              + BLKSIZE *sizeof(ISNODE*) +8
              + BLKSIZE *sizeof(int) +8
              + nitems *sizeof(int) +8;
  #endif                        /* initialize the benchmark variables */
  ist_init(tree, 1, EM_NONE, 1);        /* initialize rule extraction */

  root->parent = root->succ = NULL;
  root->offset = root->id = 0;
  root->chcnt  = 0;
  root->size   = size;
  for (k = 0; k < nitems; k++)  /* copy the item frequencies */
    root->cnts[k] = is_getfrq(set, k);
  return tree;

  /* --- error exits: release in reverse order of allocation --- */
fail_map:
  free(tree->map);
fail_buf:
  free(tree->buf);
fail_lvls:
  free(tree->lvls);
fail_tree:
  free(tree);
  return NULL;
}  /* ist_create() */
|
672
|
+
|
673
|
+
/*--------------------------------------------------------------------*/
|
674
|
+
|
675
|
+
/*
 * Delete an item set tree: free all nodes of all levels (deepest level
 * first), then the level vector, the identifier map, the path buffer,
 * and finally the tree body itself.
 */
void ist_delete (ISTREE *ist)
{
  int    lvl;                   /* level index */
  ISNODE *node;                 /* node being freed */
  ISNODE *next;                 /* its successor on the level */

  assert(ist);

  for (lvl = ist->height - 1; lvl >= 0; lvl--) {
    node = ist->lvls[lvl];
    while (node) {              /* walk the level's node list */
      next = node->succ;
      free(node);
      node = next;
    }
  }
  free(ist->lvls);              /* the level vector, */
  free(ist->map);               /* the identifier map, */
  free(ist->buf);               /* the path buffer, */
  free(ist);                    /* and the tree body */
}  /* ist_delete() */
|
690
|
+
|
691
|
+
/*--------------------------------------------------------------------*/
|
692
|
+
|
693
|
+
/*
 * Count a transaction (`cnt` items in `set`) in the tree.  Transactions
 * with fewer items than the tree height are ignored; all others are
 * passed to _count() starting at the root node.
 */
void ist_count (ISTREE *ist, int *set, int cnt)
{
  assert(ist && (cnt >= 0) && (set || (cnt <= 0)));

  if (cnt < ist->height)        /* too short for the deepest level */
    return;
  _count(ist->lvls[0], set, cnt, ist->height);
}  /* ist_count() */
|
700
|
+
|
701
|
+
/*--------------------------------------------------------------------*/
|
702
|
+
|
703
|
+
/*
 * Count the transactions of a transaction tree: hands the root node,
 * the transaction tree and the current tree height to _countx().
 */
void ist_countx (ISTREE *ist, TATREE *tat)
{
  ISNODE *root;                 /* root node of the item set tree */

  assert(ist && tat);

  root = ist->lvls[0];
  _countx(root, tat, ist->height);
}  /* ist_countx() */
|
708
|
+
|
709
|
+
/*--------------------------------------------------------------------*/
|
710
|
+
|
711
|
+
/*
 * Check item usage.  Clears one marker per root counter in `marks`,
 * lets _checkuse() set markers (called with the root node and
 * ist->supp), and returns the number of non-zero markers afterwards.
 * `marks` must provide at least as many entries as the root node has
 * counters.
 */
int ist_check (ISTREE *ist, char *marks)
{
  int k;                        /* loop variable */
  int used;                     /* number of marked items */

  assert(ist);

  for (k = 0; k < ist->lvls[0]->size; k++)
    marks[k] = 0;               /* clear the marker vector */

  _checkuse(ist->lvls[0], marks, ist->supp);

  used = 0;
  for (k = 0; k < ist->lvls[0]->size; k++) {
    if (marks[k])
      used++;                   /* count used items */
  }
  return used;
}  /* ist_check() */
|
723
|
+
|
724
|
+
/*--------------------------------------------------------------------*/
|
725
|
+
|
726
|
+
/*
 * Add a level to the item set tree.
 *
 * For every node of the deepest level, _child() is called for each
 * counter; the children created are appended to the list of the new
 * level.  A node that received children is reallocated so that a child
 * vector fits behind its counters (and identifier map, if any):
 *   - pure counter vector: a pure child vector spanning the item ids
 *     from the first to the last child;
 *   - identifier map, more than half of the counters have children:
 *     a child vector with one entry per counter;
 *   - otherwise: a compact child vector followed by a secondary
 *     identifier map holding the children's item ids.
 * If realloc() moved the node, the reference in its parent's child
 * vector is updated.  Nodes without children get chcnt = F_SKIP.
 *
 * Returns 0 if a level was added, 1 if no child node was created, and
 * -1 on a memory allocation failure (after _cleanup() where nodes of the
 * new level may already exist).
 */
int ist_addlvl (ISTREE *ist)
{
  int    k;                     /* loop variable */
  int    nkids;                 /* number of children / vector size */
  int    extra;                 /* bytes behind the node header */
  int    len;                   /* length of an identifier map */
  int    c;                     /* number of children of the parent */
  int    base;                  /* item id of the first child */
  int    newsz;                 /* enlarged level vector size */
  ISNODE **link;                /* to traverse the nodes */
  ISNODE **tail;                /* end of new level node list */
  ISNODE *nd;                   /* new (reallocated) node */
  ISNODE *kid;                  /* current node in new level */
  ISNODE *first;                /* first child of current node */
  ISNODE *final;                /* last child of current node */
  ISNODE *par;                  /* parent of the reallocated node */
  ISNODE **kids;                /* child node vector */
  int    *ids;                  /* identifier map */
  void   *mem;                  /* temporary buffer */

  assert(ist);

  /* --- enlarge level vector --- */
  if (ist->height >= ist->vsz) {
    newsz = ist->vsz +BLKSIZE;
    mem = realloc(ist->lvls, newsz *sizeof(ISNODE*));
    if (!mem)
      return -1;
    ist->lvls = (ISNODE**)mem;
    mem = realloc(ist->buf, newsz *sizeof(int));
    if (!mem)
      return -1;
    ist->buf = (int*)mem;
    ist->vsz = newsz;           /* applies to buf and levels */
  }
  tail = ist->lvls +ist->height;
  *tail = NULL;                 /* start a new tree level */

  /* --- add tree level --- */
  for (link = ist->lvls +ist->height -1; *link; link = &(*link)->succ) {

    /* create the children of the current node */
    first = final = NULL;
    nkids = 0;
    for (k = 0; k < (*link)->size; k++) {
      kid = _child(ist, *link, k, ist->supp, ist->rule);
      if (!kid)
        continue;               /* no child needed for this counter */
      if (kid == (void*)-1) {
        _cleanup(ist);
        return -1;
      }
      if (!first)
        first = kid;
      final = kid;
      *tail = kid;              /* append to the new level's list */
      tail  = &kid->succ;
      nkids++;
    }
    if (nkids <= 0) {           /* if no child node was created, */
      (*link)->chcnt = F_SKIP;  /* skip the node */
      continue;
    }
    #ifdef BENCH                /* if benchmark version */
    ist->cpnec += nkids;        /* sum the number of necessary */
    #endif                      /* child pointers */

    /* decide on the node structure and get the additional size */
    nd = *link;
    if (nd->offset >= 0) {      /* pure counter vector */
      nkids = ID(final)-ID(first)+1;
      extra = (nd->size -1) *sizeof(int) +nkids *sizeof(ISNODE*);
    }
    else if (2*nkids > nd->size) {      /* single identifier map */
      nkids = nd->size;
      extra = (nkids+nkids-1) *sizeof(int) +nkids *sizeof(ISNODE*);
    }
    else {                      /* two identifier maps */
      extra = nd->size;
      extra = (extra+extra-1) *sizeof(int)
            + nkids *(sizeof(ISNODE*) +sizeof(int));
    }
    nd = (ISNODE*)realloc(nd, sizeof(ISNODE) +extra);
    if (!nd) {
      _cleanup(ist);
      return -1;
    }
    nd->chcnt = nkids;          /* add a child vector to the node */
    #ifdef BENCH                /* if benchmark version */
    ist->cpcnt += nkids;        /* sum the number of child pointers */
    if ((nd->offset >= 0) || (nd->size == nkids))
      ist->bytes += nkids * sizeof(ISNODE*);
    else
      ist->bytes += nkids *(sizeof(ISNODE*) +sizeof(int));
    #endif                      /* determine the memory usage */

    /* if the node was moved, adapt the reference from the parent */
    if ((nd != *link) && nd->parent) {
      par = nd->parent;
      if (par->offset >= 0) {   /* pure child vector in the parent */
        kids = (ISNODE**)(par->cnts +par->size);
        /* kids[0] may still be the stale pointer to this very node; */
        /* it must not be dereferenced in that case */
        kids[(kids[0] != *link) ? ID(nd) -ID(kids[0]) : 0] = nd;
      }
      else {                    /* identifier map(s) in the parent */
        len  = par->size;
        ids  = par->cnts +len;
        kids = (ISNODE**)(ids +len);
        c    = par->chcnt & ~F_SKIP;
        if (c < len) {          /* a secondary id. map exists */
          len = c;
          ids = (int*)(kids +len);
        }
        kids[_bsearch(ids, len, ID(nd))] = nd;
      }
    }
    *link = nd;                 /* set new (reallocated) node */

    /* fill the child vector and set the children's parent pointers */
    if (nd->offset >= 0) {      /* pure vectors */
      kids = (ISNODE**)(nd->cnts +nd->size);
      for (k = 0; k < nkids; k++)
        kids[k] = NULL;
      base = ID(first);
      for (kid = first; kid; kid = kid->succ) {
        kids[ID(kid)-base] = kid;
        kid->parent = nd;
      }
    }
    else if (nkids < nd->size) {        /* two identifier maps */
      kids = (ISNODE**)(nd->cnts +nd->size +nd->size);
      ids  = (int*)(kids +nkids);       /* secondary identifier map */
      k = 0;
      for (kid = first; kid; kid = kid->succ) {
        kids[k] = kid;
        ids[k]  = ID(kid);
        k++;
        kid->parent = nd;
      }
    }
    else {                      /* one identifier map */
      len  = nd->size;
      ids  = nd->cnts +len;
      kids = (ISNODE**)(ids +len);
      for (k = 0; k < nkids; k++)
        kids[k] = NULL;
      for (kid = first; kid; kid = kid->succ) {
        kids[_bsearch(ids, len, ID(kid))] = kid;
        kid->parent = nd;
      }
    }
  }

  if (!ist->lvls[ist->height])  /* if no child has been added, */
    return 1;                   /* abort the function */
  ist->height++;                /* increment the level counter */
  _checksub(ist->lvls[0]);      /* check for unnecessary subtrees */
  return 0;                     /* return 'ok' */
}  /* ist_addlvl() */
|
838
|
+
|
839
|
+
/*--------------------------------------------------------------------*/
|
840
|
+
|
841
|
+
/*
 * Move the current node up in the tree: to the root if `root` is
 * non-zero, otherwise to the parent of the current node.  The current
 * node is left unchanged if it has no parent.
 */
void ist_up (ISTREE *ist, int root)
{
  ISNODE *parent;               /* parent of the current node */

  assert(ist && ist->curr);

  if (root) {                   /* jump directly to the root node */
    ist->curr = ist->lvls[0];
    return;
  }
  parent = ist->curr->parent;
  if (parent)                   /* one step up, if possible */
    ist->curr = parent;
}  /* ist_up() */
|
849
|
+
|
850
|
+
/*--------------------------------------------------------------------*/
|
851
|
+
|
852
|
+
/*
 * Move the current node down to the child that belongs to `item`.
 * Returns 0 on success and -1 if the current node has no children or no
 * child for `item`; the current node is unchanged in that case.
 */
int ist_down (ISTREE *ist, int item)
{
  ISNODE *node;                 /* the current node */
  ISNODE **kids;                /* its child node vector */
  int    *ids;                  /* identifier map */
  int    len;                   /* size of the identifier map */
  int    nkids;                 /* number of children */
  int    idx;                   /* index in the child vector */

  assert(ist && ist->curr);

  node  = ist->curr;
  nkids = node->chcnt & ~F_SKIP;
  if (nkids <= 0)               /* there are no child nodes */
    return -1;

  if (node->offset >= 0) {      /* pure vector: index relative to */
    kids = (ISNODE**)(node->cnts +node->size);
    idx  = item -ID(kids[0]);   /* the first child's item id */
    if (idx >= nkids)
      return -1;
  }
  else {                        /* identifier map(s) */
    len  = node->size;
    ids  = node->cnts +len;
    kids = (ISNODE**)(ids +len);
    if (nkids < len) {          /* a secondary id. map exists */
      len = nkids;
      ids = (int*)(kids +len);
    }
    idx = _bsearch(ids, len, item);
  }

  if (idx < 0)                  /* index is out of range */
    return -1;
  if (!kids[idx])               /* the child does not exist */
    return -1;
  ist->curr = kids[idx];        /* go to the child node */
  return 0;                     /* return 'ok' */
}  /* ist_down() */
|
879
|
+
|
880
|
+
/*--------------------------------------------------------------------*/
|
881
|
+
|
882
|
+
/*
 * Get the next item that has a counter in the current node.
 *
 * Returns the smallest item identifier greater than `item` for which
 * the current node holds a counter, or -1 if there is none.  Passing an
 * item smaller than the first one (e.g. -1) yields the first item, so
 *   for (i = -1; (i = ist_next(ist, i)) >= 0; ) ...
 * visits exactly the items of the node.
 *
 * Fixes over the previous version:
 *  - pure vector: the last item (offset+size-1) used to yield
 *    offset+size, an identifier without a counter, instead of -1;
 *  - identifier map: for an item lying between two map entries the
 *    fallback scan compared in the wrong direction and always returned
 *    the first map entry instead of the next larger one;
 *  - a node without counters now yields -1 instead of reading map[0].
 */
int ist_next (ISTREE *ist, int item)
{
  int    i;                     /* vector index */
  ISNODE *node;                 /* the current node */
  int    *map, n;               /* identifier map and its size */

  assert(ist && ist->curr);     /* check the function argument */
  node = ist->curr;             /* get the current node */
  n    = node->size;
  if (n <= 0)                   /* no counters, hence no items */
    return -1;

  if (node->offset >= 0) {      /* if a pure vector is used, */
    if (item <  node->offset)
      return node->offset;      /* start with the first item */
    if (item >= node->offset +n -1)
      return -1;                /* already at or past the last item */
    return item +1;             /* items are consecutive */
  }

  map = node->cnts +n;          /* if an identifier map is used */
  if (item <  map[0])
    return map[0];
  if (item >= map[n-1])
    return -1;
  i = _bsearch(map, n, item);   /* try to find the item directly */
  if (i >= 0)
    return map[i+1];            /* i < n-1, since item < map[n-1] */
  while (*map <= item)          /* otherwise skip all smaller items; */
    map++;                      /* terminates because map[n-1] > item */
  return *map;
}  /* ist_next() */
|
904
|
+
|
905
|
+
/*--------------------------------------------------------------------*/
|
906
|
+
|
907
|
+
/*
 * Set the counter for `item` in the current node to `cnt`.  Nothing is
 * changed if the current node has no counter for the item.
 *
 * The counter index is always looked up in the primary identifier map,
 * which is stored directly behind the counters and is aligned with them.
 * The previous version switched to the secondary map whenever the node
 * had fewer children than counters.  That map indexes the child vector,
 * not the counters, so the wrong counter was selected; on a node without
 * children the map length became 0 and no counter was ever set.
 */
void ist_setcnt (ISTREE *ist, int item, int cnt)
{
  ISNODE *node;                 /* the current node */
  int    idx;                   /* index in the counter vector */

  assert(ist && ist->curr);     /* check the function argument */
  node = ist->curr;             /* get the current node */
  if (node->offset >= 0) {      /* if a pure vector is used, */
    idx = item -node->offset;   /* get index in counter vector */
    if (idx >= node->size)
      return;
  }
  else {                        /* if an identifier map is used, */
    idx = _bsearch(node->cnts +node->size, node->size, item);
  }                             /* search the primary map */
  if (idx >= 0)
    node->cnts[idx] = cnt;      /* set the frequency counter */
}  /* ist_setcnt() */
|
929
|
+
|
930
|
+
/*--------------------------------------------------------------------*/
|
931
|
+
|
932
|
+
/*
 * Get the counter for `item` in the current node.  Returns the stored
 * counter value as is, or -1 if the current node has no counter for the
 * item.
 *
 * The counter index is always looked up in the primary identifier map,
 * which is stored directly behind the counters and is aligned with them.
 * The previous version switched to the secondary map whenever the node
 * had fewer children than counters.  That map indexes the child vector,
 * not the counters, so the wrong counter was read; on a node without
 * children the map length became 0 and the lookup always failed.
 */
int ist_getcnt (ISTREE *ist, int item)
{
  ISNODE *node;                 /* the current node */
  int    idx;                   /* index in the counter vector */

  assert(ist && ist->curr);     /* check the function argument */
  node = ist->curr;             /* get the current node */
  if (node->offset >= 0) {      /* if pure vectors are used, */
    idx = item -node->offset;   /* get index in counter vector */
    if (idx >= node->size)
      return -1;
  }
  else {                        /* if an identifier map is used, */
    idx = _bsearch(node->cnts +node->size, node->size, item);
  }                             /* search the primary map */
  if (idx < 0)
    return -1;                  /* abort if index is out of range */
  return node->cnts[idx];       /* return the value of the counter */
}  /* ist_getcnt() */
|
955
|
+
|
956
|
+
/*--------------------------------------------------------------------*/
|
957
|
+
|
958
|
+
/* Get the support of an item set.
 *
 * ist  item set tree
 * set  item identifiers of the set (may be NULL only if cnt is 0)
 * cnt  number of items in `set`
 *
 * Returns the transaction count for the empty set, otherwise the
 * support found by _getsupp() starting at the root, with COUNT()
 * applied to the result.
 */
int ist_getcntx (ISTREE *ist, int *set, int cnt)
{                               /* --- get counter for an item set */
  assert(ist                    /* check the function arguments */
      && (cnt >= 0) && (set || (cnt <= 0)));
  if (cnt > 0)                  /* non-empty set: look up support */
    return COUNT(_getsupp(ist->lvls[0], set, cnt));
  return ist->tacnt;            /* empty set: all transactions */
}  /* ist_getcntx() */
|
966
|
+
|
967
|
+
/*--------------------------------------------------------------------*/
|
968
|
+
|
969
|
+
/* Filter the frequent item sets stored in the tree.
 *
 * ist   item set tree
 * mode  IST_CLEAR removes the F_SKIP flag from all counters on the
 *       levels 1 .. height-1; any other mode calls _marksub() for
 *       every counter on these levels that reaches ist->supp.
 *       With IST_CLOSED the counter value itself is passed to
 *       _marksub() as the support, otherwise -1 is passed.
 */
void ist_filter (ISTREE *ist, int mode)
{                               /* --- filter frequent item sets */
  int    lvl, idx;              /* level and counter index */
  ISNODE *nd;                   /* to traverse the nodes of a level */
  int    s_flt;                 /* support passed to _marksub() */

  assert(ist);                  /* check the function argument */
  if (mode == IST_CLEAR) {      /* if to clear all skip flags */
    for (lvl = 1; lvl < ist->height; lvl++) {
      nd = ist->lvls[lvl];      /* traverse the nodes of the level */
      while (nd) {              /* and remove the flag everywhere */
        for (idx = 0; idx < nd->size; idx++)
          nd->cnts[idx] &= ~F_SKIP;
        nd = nd->succ;
      }
    }
    return;                     /* nothing else to do */
  }
  s_flt = -1;                   /* default support filter */
  for (lvl = 1; lvl < ist->height; lvl++) {
    nd = ist->lvls[lvl];        /* traverse the nodes of the level */
    while (nd) {
      for (idx = 0; idx < nd->size; idx++) {
        if (nd->cnts[idx] >= ist->supp) {  /* frequent sets only */
          if (mode == IST_CLOSED) s_flt = nd->cnts[idx];
          _marksub(ist, nd, idx, s_flt);
        }                       /* mark the subsets of the item set */
      }
      nd = nd->succ;            /* go to the next node of the level */
    }
  }
}  /* ist_filter() */
|
995
|
+
|
996
|
+
/*--------------------------------------------------------------------*/
|
997
|
+
|
998
|
+
/* Initialize the extraction of item sets, rules or hyperedges.
 *
 * ist     item set tree
 * minlen  minimal size of the sets/rules to extract (> 0)
 * arem    additional evaluation measure; values outside the range
 *         [EM_NONE, EM_UNKNOWN) are replaced by EM_NONE
 * minval  minimal value of the additional measure (0 .. 1)
 *
 * Resets the extraction state (item, index, head) and positions the
 * traversal at the first node of level minlen-1.
 */
void ist_init (ISTREE *ist, int minlen, int arem, double minval)
{                               /* --- initialize (rule) extraction */
  assert(ist                    /* check the function arguments */
      && (minlen > 0) && (minval >= 0.0) && (minval <= 1.0));
  ist->index  = -1;             /* no item set selected yet */
  ist->item   = -1;             /* no head item selected yet */
  ist->head   = NULL;           /* no head node selected yet */
  ist->size   = minlen;         /* start with sets of minimal size */
  ist->node   = ist->lvls[minlen -1];
  if ((arem >= EM_NONE) && (arem < EM_UNKNOWN))
    ist->arem = arem;           /* note a valid evaluation measure, */
  else                          /* otherwise fall back to 'none' */
    ist->arem = EM_NONE;
  ist->minval = minval;         /* note the minimal measure value */
}  /* ist_init() */
|
1011
|
+
|
1012
|
+
/*--------------------------------------------------------------------*/
|
1013
|
+
|
1014
|
+
/* Extract the next frequent item set.
 *
 * ist   item set tree (extraction state set up by ist_init())
 * set   output buffer for the item identifiers of the set
 * supp  receives the support of the item set
 * aval  receives the value of the additional measure (may be NULL)
 *
 * Returns the size of the extracted item set, or -1 when the
 * traversal has passed the deepest tree level.
 */
int ist_set (ISTREE *ist, int *set, int *supp, double *aval)
{                               /* --- extract next frequent item set */
  int    pos;                   /* write position in the output */
  int    item;                  /* an item identifier */
  ISNODE *node, *anc;           /* current node, ancestor on path */
  int    *frqs;                 /* item frequencies (root counters) */
  int    s_set;                 /* support of the current set */
  double dev;                   /* deviation from indep. occurrence */

  assert(ist && set && supp);   /* check the function arguments */
  if (ist->size > ist->height)  /* if the tree is not high enough */
    return -1;                  /* for the item set size, abort */

  /* --- find frequent item set --- */
  node = ist->node;             /* resume at the noted node */
  for (;;) {                    /* search for a frequent item set */
    ist->index++;               /* advance to the next counter */
    if (ist->index >= node->size) {
      node = node->succ;        /* node exhausted: take successor */
      if (!node) {              /* level exhausted: go one level down */
        ist->size++;
        if (ist->size > ist->height)
          return -1;            /* beyond the deepest level: done */
        node = ist->lvls[ist->size -1];
      }
      ist->node  = node;        /* note the new node and restart */
      ist->index = 0;           /* with its first counter */
    }
    item = (node->offset >= 0)  /* get the item of the counter */
         ? node->offset +ist->index
         : node->cnts[node->size +ist->index];
    if (is_getapp(ist->set, item) == IST_IGNORE)
      continue;                 /* skip items to ignore */
    s_set = node->cnts[ist->index];
    if (s_set < ist->supp)      /* The raw counter is compared: a */
      continue;                 /* counter carrying F_SKIP is negative */
                                /* and is therefore skipped here too. */
    dev = 0;                    /* init. add. evaluation measure */
    if (ist->arem == EM_DIFF) { /* if logarithm of support quotient */
      frqs = ist->lvls[0]->cnts;
      dev  = log(s_set) -log(COUNT(frqs[item]));
      for (anc = node; anc->parent; anc = anc->parent)
        dev -= log(COUNT(frqs[ID(anc)]));
      dev = (dev +(ist->size-1) *log(ist->tacnt)) *(0.01/LN_2);
      if (dev < ist->minval)    /* measure value too small: */
        continue;               /* try the next item set */
    }
    break;                      /* an item set has been found */
  }
  *supp = s_set;                /* store the item set support and */
  if (aval) *aval = dev;        /* the value of the add. measure */

  /* --- build frequent item set --- */
  pos = ist->size -1;           /* the last slot takes the item */
  set[pos] = item;              /* of the selected counter */
  for ( ; node->parent; node = node->parent)
    set[--pos] = ID(node);      /* collect the items up to the root */
  return ist->size;             /* return the item set size */
}  /* ist_set() */
|
1074
|
+
|
1075
|
+
/*--------------------------------------------------------------------*/
|
1076
|
+
|
1077
|
+
/* Extract the next association rule.
 *
 * ist   item set tree (extraction state set up by ist_init())
 * rule  output buffer; rule[0] receives the head item, the
 *       remaining entries the items of the body
 * supp  receives the rule support (support of body and head if
 *       ist->mode has IST_HEAD set, otherwise support of the body)
 * conf  receives the rule confidence (may be NULL)
 * lift  receives the lift value (may be NULL)
 * aval  receives the value of the additional measure (may be NULL)
 *
 * For each item set every item is tried as the rule head in turn:
 * first the item of the counter itself, then the items on the path
 * to the root (ist->head walks up, ist->path collects the items
 * that were already used as head).
 *
 * Returns the rule size (number of items), or -1 when the traversal
 * has passed the deepest tree level.
 */
int ist_rule (ISTREE *ist, int *rule,
              int *supp, double *conf, double *lift, double *aval)
{                               /* --- extract next rule */
  int    pos;                   /* write position in the output */
  int    item;                  /* an item identifier */
  ISNODE *node;                 /* current item set node */
  ISNODE *par;                  /* parent of the item set node */
  int    *ids, cnt;             /* identifier map and its size */
  int    s_set;                 /* support of set (body & head) */
  int    s_body;                /* support of body (antecedent) */
  int    s_head;                /* support of head (consequent) */
  double cf, val;               /* confidence and measure value */
  int    app;                   /* appearance flag of head item */

  assert(ist && rule && supp);  /* check the function arguments */
  if (ist->size > ist->height)  /* if the tree is not high enough */
    return -1;                  /* for the rule length, abort */

  /* --- find rule --- */
  node = ist->node;             /* resume at the noted node */
  for (;;) {                    /* search for a rule */
    if (ist->item >= 0) {       /* --- select next head in same set */
      ist->path--;              /* move the previous head item */
      *ist->path = ist->item;   /* into the path (it joins the body) */
      ist->plen++;
      ist->item = ID(ist->head);      /* next head: item of the node */
      ist->head = ist->head->parent;  /* one step further up */
      if (!ist->head)           /* no node left: all heads tried, */
        ist->item = -1;         /* so request a new item set */
    }
    if (ist->item < 0) {        /* --- select next item set */
      ist->index++;             /* advance to the next counter */
      if (ist->index >= node->size) {
        node = node->succ;      /* node exhausted: take successor */
        if (!node) {            /* level exhausted: go one level down */
          ist->size++;
          if (ist->size > ist->height)
            return -1;          /* beyond the deepest level: done */
          node = ist->lvls[ist->size -1];
        }
        ist->node  = node;      /* note the new node and restart */
        ist->index = 0;         /* with its first counter */
      }
      item = (node->offset >= 0)
           ? node->offset +ist->index
           : node->cnts[node->size +ist->index];
      app = is_getapp(ist->set, item);
      if ((app == IST_IGNORE) || (HDONLY(node) && (app == IST_HEAD)))
        continue;               /* skip sets with two head only items */
      ist->item   = item;       /* first head: the counter's item */
      ist->hdonly = HDONLY(node) || (app == IST_HEAD);
      ist->head   = node;       /* node from which to walk upwards */
      ist->path   = ist->buf +ist->vsz;
      ist->plen   = 0;          /* start with an empty path */
    }
    app = is_getapp(ist->set, ist->item);  /* head item appearance */
    if (!(app & IST_HEAD) || (ist->hdonly && (app != IST_HEAD)))
      continue;                 /* if rule is not allowed, skip it */
    s_set = COUNT(node->cnts[ist->index]);
    if (s_set < ist->supp) {    /* infrequent set: no head can work, */
      ist->item = -1;           /* so go on with the next item set */
      continue;
    }
    par = node->parent;         /* get the parent node */
    if (ist->plen > 0)          /* if there is a path, use it */
      s_body = COUNT(_getsupp(ist->head, ist->path, ist->plen));
    else if (!par)              /* root node: the body is empty, */
      s_body = ist->tacnt;      /* so all transactions support it */
    else if (par->offset >= 0)  /* parent uses a pure vector */
      s_body = COUNT(par->cnts[ID(node) -par->offset]);
    else {                      /* parent uses an identifier map */
      cnt = par->size;
      ids = par->cnts +cnt;
      s_body = COUNT(par->cnts[_bsearch(ids, cnt, ID(node))]);
    }
    if (s_body < ist->rule)     /* if the body support is too low, */
      continue;                 /* get the next subset/next set */
    cf = s_set/(double)s_body;  /* compute the rule confidence */
    if (cf < ist->conf -EPSILON)
      continue;                 /* confidence too low: next candidate */
    s_head = COUNT(ist->lvls[0]->cnts[ist->item]);
    if ((ist->arem == EM_NONE)  /* without an additional measure or */
    ||  (ist->size < 2)) {      /* with an empty antecedent the */
      val = 0; break; }         /* rule is selected right away */
    val = _evalfns[ist->arem](s_set, s_body, s_head, ist->tacnt);
    if (val >= ist->minval)     /* measure value is high enough: */
      break;                    /* select the rule */
  }
  *supp = (ist->mode & IST_HEAD) ? s_set : s_body;
  if (lift)                     /* compute and store the lift value */
    *lift = (cf *ist->tacnt)/(double)s_head;
  if (conf) *conf = cf;         /* store the rule confidence and */
  if (aval) *aval = val;        /* the value of the add. measure */

  /* --- build rule --- */
  item = (node->offset >= 0)    /* item of the selected counter */
       ? node->offset +ist->index
       : node->cnts[node->size +ist->index];
  pos = ist->size;              /* fill the body from the back */
  if (item != ist->item)        /* if this item is not the head, */
    rule[--pos] = item;         /* add it to the rule body */
  for ( ; node->parent; node = node->parent) {
    if (ID(node) != ist->item)  /* add all items on the path to the */
      rule[--pos] = ID(node);   /* root, except the head of the rule */
  }
  rule[0] = ist->item;          /* the head goes to the front */
  return ist->size;             /* return the rule size */
}  /* ist_rule() */
|
1179
|
+
|
1180
|
+
/*--------------------------------------------------------------------*/
|
1181
|
+
|
1182
|
+
/* Extract the next hyperedge.
 *
 * ist    item set tree (extraction state set up by ist_init())
 * hedge  output buffer for the item identifiers of the hyperedge
 * supp   receives the support of the item set
 * conf   receives the confidence averaged over all rules that can
 *        be formed from the set with one item as head (may be NULL)
 * aval   receives the minimum of the additional measure over these
 *        rules (may be NULL)
 *
 * Returns the size of the hyperedge, or -1 when the traversal has
 * passed the deepest tree level.
 *
 * Fix: the path used for the body support lookups is now seeded
 * with the item identifier of the selected counter.  The previous
 * code stored `ist->index + node->offset`, which is the identifier
 * only for pure vector nodes; for nodes with an identifier map
 * (offset < 0) it is not an item identifier, so _getsupp() was
 * asked for the wrong item sets.
 *
 * NOTE(review): _evalfns[ist->arem] is called without the EM_NONE
 * guard that ist_rule() has -- confirm that the table entry for
 * EM_NONE is callable or that callers never use EM_NONE here.
 */
int ist_hedge (ISTREE *ist, int *hedge,
               int *supp, double *conf, double *aval)
{                               /* --- extract next hyperedge */
  int    i;                     /* loop variable */
  int    item;                  /* an item identifier */
  ISNODE *node;                 /* current item set node */
  ISNODE *head;                 /* node containing the rule head */
  int    *map, n;               /* identifier map and its size */
  int    *path, plen;           /* path in tree and its length */
  int    s_set;                 /* support of set (body & head) */
  int    s_body;                /* support of body (antecedent) */
  int    s_head;                /* support of head (consequent) */
  double c, t, v = 0;           /* confidence and measure value */

  assert(ist && hedge && supp); /* check the function arguments */
  if (ist->size > ist->height)  /* if the tree is not high enough */
    return -1;                  /* for the hyperedge size, abort */

  /* --- find hyperedge --- */
  node = ist->node;             /* get the current item set node */
  while (1) {                   /* search for a hyperedge */
    if (++ist->index >= node->size) { /* if all subsets have been */
      node = node->succ;        /* processed, go to the successor */
      if (!node) {              /* if at the end of a level, go down */
        if (++ist->size > ist->height)
          return -1;            /* if beyond the deepest level, abort */
        node = ist->lvls[ist->size -1];
      }                         /* get the 1st node of the new level */
      ist->node  = node;        /* note the new item set node and */
      ist->index = 0;           /* start with the first item set */
    }                           /* of the new item set node */
    if (node->offset >= 0) item = node->offset +ist->index;
    else                   item = node->cnts[node->size +ist->index];
    if (is_getapp(ist->set, item) == IST_IGNORE)
      continue;                 /* skip items to ignore */
    s_set = COUNT(node->cnts[ist->index]);
    if (s_set < ist->supp)      /* if the set support is too low, */
      continue;                 /* skip this item set */
    head = node->parent;        /* get subset support from parent */
    if (!head)                  /* if there is no parent (root node), */
      s_body = ist->tacnt;      /* get the total number of sets */
    else if (head->offset >= 0) /* if pure vectors are used */
      s_body = head->cnts[ID(node) -head->offset];
    else {                      /* if an identifier map is used */
      map = head->cnts +(n = head->size);
      s_body = head->cnts[_bsearch(map, n, ID(node))];
    }                           /* find index and get the support */
    if (s_body & F_SKIP) {      /* a skipped body invalidates the */
      node->cnts[ist->index] |= F_SKIP; continue; }  /* set as well */
    s_body = COUNT(s_body);     /* get the support of body and head */
    s_head = COUNT(ist->lvls[0]->cnts[item]);
    c = s_set/(double)s_body;   /* compute confidence and add. eval. */
    v = _evalfns[ist->arem](s_set, s_body, s_head, ist->tacnt);
    plen = 1;                   /* initialize the path with the */
    path = ist->buf +ist->vsz;  /* item of the selected counter */
    *--path = item;             /* (valid for both node layouts) */
    item = ID(node);            /* note the next head item */
    while (head) {              /* traverse the path up to root */
      s_body = _getsupp(head, path, plen);
      if (s_body & F_SKIP) break;   /* incomplete traversal */
      s_body = COUNT(s_body);   /* get the support of the body */
      *--path = item; plen++;   /* store the previous head item */
      item = ID(head);          /* in the path (extend path) */
      c += s_set/(double)s_body;/* sum the rule confidences */
      s_head = COUNT(ist->lvls[0]->cnts[item]);
      t = _evalfns[ist->arem](s_set, s_body, s_head, ist->tacnt);
      if (t < v) v = t;         /* keep the minimum of the measure */
      head = head->parent;      /* and go to the parent node */
    }                           /* (get the next rule head) */
    c /= ist->size;             /* average the rule confidences */
    if (!head                   /* check for a complete traversal */
    &&  (c > ist->conf -EPSILON)
    &&  (v > fabs(ist->minval) -EPSILON))
      break;                    /* check whether hyperedge qualifies */
    if ((ist->minval < 0) && node->parent)
      node->cnts[ist->index] |= F_SKIP;
  }  /* while (1) */            /* otherwise mark it as skipped */
  *supp = s_set;                /* store the hyperedge support, */
  if (conf) *conf = c;          /* the average confidence and */
  if (aval) *aval = v;          /* the value of the add. measure */

  /* --- build hyperedge --- */
  i = ist->size -1;             /* store the first item */
  if (node->offset >= 0) hedge[i] = ist->index +node->offset;
  else                   hedge[i] = node->cnts[node->size +ist->index];
  while (node->parent) {        /* while not at the root node */
    hedge[--i] = ID(node);      /* add item to the hyperedge */
    node = node->parent;        /* and go to the parent node */
  }
  return ist->size;             /* return the hyperedge size */
}  /* ist_hedge() */
|
1273
|
+
|
1274
|
+
/*--------------------------------------------------------------------*/
|
1275
|
+
|
1276
|
+
/* Extract the next item group.
 *
 * ist    item set tree (extraction state set up by ist_init())
 * group  output buffer for the item identifiers of the group
 * supp   receives the support of the item set
 * aval   receives the minimum of the additional measure over all
 *        rules that can be formed from the set with one item as
 *        head (may be NULL)
 *
 * In contrast to the other extraction functions the levels are
 * traversed from the deepest level upwards; sets with fewer than
 * two items are not reported.  The subsets of every reported group
 * are marked with _marksub() so that they are skipped later.
 *
 * Returns the size of the group, or -1 when all groups have been
 * reported.
 *
 * Fix: the path used for the body support lookups is now seeded
 * with the item identifier of the selected counter.  The previous
 * code stored `ist->index + node->offset`, which is the identifier
 * only for pure vector nodes; for nodes with an identifier map
 * (offset < 0) it is not an item identifier, so _getsupp() was
 * asked for the wrong item sets.
 *
 * NOTE(review): _evalfns[ist->arem] is called without the EM_NONE
 * guard that ist_rule() has -- confirm that the table entry for
 * EM_NONE is callable or that callers never use EM_NONE here.
 */
int ist_group (ISTREE *ist, int *group, int *supp, double *aval)
{                               /* --- extract next group */
  int    i;                     /* loop variable */
  int    item;                  /* an item identifier */
  ISNODE *node;                 /* current item set node */
  ISNODE *head;                 /* node containing the rule head */
  int    *map, n;               /* identifier map and its size */
  int    *path, plen;           /* path in tree and its length */
  int    s_set;                 /* support of set (body & head) */
  int    s_body;                /* support of body (antecedent) */
  int    s_head;                /* support of head (consequent) */
  double t, v = 0;              /* additional measure value */

  assert(ist && group && supp); /* check the function arguments */
  if (ist->item < 0) {          /* if this is the first call */
    ist->size = ist->height;    /* start at the deepest level */
    ist->node = ist->lvls[ist->size -1]; ist->item = 0;
  }
  if (ist->size <= 1)           /* if all groups are reported, */
    return -1;                  /* abort the extraction */

  /* --- find next group --- */
  node = ist->node;             /* get the current item set node */
  while (1) {                   /* search for a group */
    if (++ist->index >= node->size) { /* if all subsets have been */
      node = node->succ;        /* processed, go to the successor */
      if (!node) {              /* if at the end of a level, go up */
        if (--ist->size <= 1)   /* if all groups are reported, */
          return -1;            /* abort the extraction */
        node = ist->lvls[ist->size -1];
      }                         /* get the 1st node of the new level */
      ist->node  = node;        /* note the new item set node and */
      ist->index = 0;           /* start with the first item set */
    }                           /* of the new item set node */
    if (node->offset >= 0) item = node->offset +ist->index;
    else                   item = node->cnts[node->size +ist->index];
    if (is_getapp(ist->set, item) == IST_IGNORE)
      continue;                 /* skip items to ignore */
    s_set = node->cnts[ist->index];
    if (s_set < ist->supp) {    /* if the set support is too low */
      if (s_set & F_SKIP) _marksub(ist, node, ist->index, -1);
      continue;                 /* mark subsets if necessary */
    }                           /* and skip this item set */
    /* Note that this check automatically skips all item sets that */
    /* are marked with the flag F_SKIP, because s_set is negative  */
    /* with this flag and thus necessarily smaller than ist->supp. */
    head = node->parent;        /* get subset support from parent */
    if (!head)                  /* if there is no parent (root node), */
      s_body = ist->tacnt;      /* get the total number of sets */
    else if (head->offset >= 0) /* if pure vectors are used */
      s_body = head->cnts[ID(node) -head->offset];
    else {                      /* if an identifier map is used */
      map = head->cnts +(n = head->size);
      s_body = head->cnts[_bsearch(map, n, ID(node))];
    }                           /* find index and get the support */
    s_body = COUNT(s_body);     /* get the support of body and head */
    s_head = COUNT(ist->lvls[0]->cnts[item]);
    v = _evalfns[ist->arem](s_set, s_body, s_head, ist->tacnt);
    plen = 1;                   /* initialize the path with the */
    path = ist->buf +ist->vsz;  /* item of the selected counter */
    *--path = item;             /* (valid for both node layouts) */
    item = ID(node);            /* note the next head item */
    while (head) {              /* traverse the path up to root */
      s_body = COUNT(_getsupp(head, path, plen));
      *--path = item; plen++;   /* store the previous head item */
      item = ID(head);          /* in the path (extend path) */
      s_head = COUNT(ist->lvls[0]->cnts[item]);
      t = _evalfns[ist->arem](s_set, s_body, s_head, ist->tacnt);
      if (t < v) v = t;         /* keep the minimum of the measure */
      head = head->parent;      /* and go to the parent node */
    }                           /* (get the next rule head) */
    if (!head                   /* check for a complete traversal */
    &&  (v > fabs(ist->minval) -EPSILON))
      break;                    /* check whether group qualifies */
  }  /* while (1) */
  *supp = s_set;                /* store the group support and */
  if (aval) *aval = v;          /* the value of the add. measure */
  _marksub(ist, node, ist->index, -1);  /* mark the group's subsets */

  /* --- build group --- */
  i = ist->size -1;             /* store the first item */
  if (node->offset >= 0) group[i] = ist->index +node->offset;
  else                   group[i] = node->cnts[node->size +ist->index];
  while (node->parent) {        /* while not at the root node */
    group[--i] = ID(node);      /* add item to the group */
    node = node->parent;        /* and go to the parent node */
  }
  return ist->size;             /* return the group size */
}  /* ist_group() */
|
1365
|
+
|
1366
|
+
/*--------------------------------------------------------------------*/
|
1367
|
+
#ifndef NDEBUG
|
1368
|
+
|
1369
|
+
/* Print the subtree rooted at `node` (debug builds only).
 *
 * node   root of the subtree to print
 * level  depth of `node` in the tree; one indentation string is
 *        printed per level in front of every line
 *
 * Every counter is printed as "<item>: <count>", directly followed
 * by the subtree of the child node for that item, if there is one.
 */
static void _showtree (ISNODE *node, int level)
{                               /* --- show subtree */
  int    idx, k;                /* counter index, buffer */
  int    *ids = NULL;           /* identifier map for child lookup */
  int    nids = 0;              /* and its size */
  int    nch;                   /* number of children */
  ISNODE **chn = NULL;          /* vector of child nodes */

  assert(node && (level >= 0)); /* check the function arguments */
  nch = node->chcnt & ~F_SKIP;  /* get the number of children */
  if (nch > 0) {                /* if there are children */
    if (node->offset >= 0)      /* pure vector: children follow */
      chn = (ISNODE**)(node->cnts +node->size);  /* the counters */
    else {                      /* identifier map: children follow */
      nids = node->size;        /* the map of the counters */
      ids  = node->cnts +nids;
      chn  = (ISNODE**)(ids +nids);
      if (nch < nids) {         /* if a secondary id. map exists, */
        nids = nch;             /* it follows the child vector */
        ids  = (int*)(chn +nch);
      }
    }
  }                             /* get child access variables */
  for (idx = 0; idx < node->size; idx++) {
    for (k = 0; k < level; k++) /* indent and print */
      printf(" ");              /* item identifier and counter */
    k = (node->offset >= 0)
      ? node->offset +idx
      : node->cnts[node->size +idx];
    printf("%d: %d\n", k, COUNT(node->cnts[idx]));
    if (!chn) continue;         /* check whether there are children */
    if (node->offset >= 0) k -= ID(chn[0]);
    else                   k  = _bsearch(ids, nids, k);
    if ((k >= 0) && (k < nch) && chn[k])
      _showtree(chn[k], level +1);
  }                             /* show subtree recursively */
}  /* _showtree() */
|
1401
|
+
|
1402
|
+
/*--------------------------------------------------------------------*/
|
1403
|
+
|
1404
|
+
/* Print a whole item set tree followed by the number of
 * transactions (debug builds only).
 *
 * ist  item set tree to print
 */
void ist_show (ISTREE *ist)
{                               /* --- show an item set tree */
  ISNODE *root;                 /* root node of the tree */

  assert(ist);                  /* check the function argument */
  root = ist->lvls[0];          /* the first level holds the root */
  _showtree(root, 0);           /* print all nodes recursively */
  printf("total: %d\n", ist->tacnt);
}  /* ist_show() */             /* print number of transactions */
|
1410
|
+
|
1411
|
+
#endif
|