apriori-rails 0.2.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (122) hide show
  1. data/History.txt +22 -0
  2. data/License.txt +20 -0
  3. data/Manifest.txt +121 -0
  4. data/README.txt +149 -0
  5. data/Rakefile +17 -0
  6. data/TODO.txt +60 -0
  7. data/attic/c_ext_test1/MyTest/MyTest.c +23 -0
  8. data/attic/c_ext_test1/MyTest/extconf.rb +11 -0
  9. data/attic/c_ext_test1/mytest.rb +10 -0
  10. data/attic/test.c +12 -0
  11. data/config/hoe.rb +88 -0
  12. data/config/requirements.rb +29 -0
  13. data/examples/01_simple_example.rb +39 -0
  14. data/examples/02_small_file_example.rb +17 -0
  15. data/examples/03_large_file_example.rb +22 -0
  16. data/examples/test_data/market_basket_basic_test.dat +9 -0
  17. data/ext/Apriori.c +149 -0
  18. data/ext/Makefile +149 -0
  19. data/ext/apriori/doc/apriori.html +1301 -0
  20. data/ext/apriori/doc/arem.gp +68 -0
  21. data/ext/apriori/doc/c_rev.gp +89 -0
  22. data/ext/apriori/doc/chi2.tex +156 -0
  23. data/ext/apriori/doc/copying +504 -0
  24. data/ext/apriori/doc/line.gif +0 -0
  25. data/ext/apriori/doc/uparrow.gif +0 -0
  26. data/ext/apriori/ex/flg2set +15 -0
  27. data/ext/apriori/ex/hdr2set +13 -0
  28. data/ext/apriori/ex/readme +71 -0
  29. data/ext/apriori/ex/row2set +7 -0
  30. data/ext/apriori/ex/rulesort +24 -0
  31. data/ext/apriori/ex/tab2set +9 -0
  32. data/ext/apriori/ex/test.app +2 -0
  33. data/ext/apriori/ex/test.rul +9 -0
  34. data/ext/apriori/ex/test1.rul +43 -0
  35. data/ext/apriori/ex/test1.tab +10 -0
  36. data/ext/apriori/ex/test2.tab +10 -0
  37. data/ext/apriori/ex/test3.tab +30 -0
  38. data/ext/apriori/ex/test4.tab +11 -0
  39. data/ext/apriori/ex/test5.tab +39 -0
  40. data/ext/apriori/ex/tid2set +23 -0
  41. data/ext/apriori/ex/xhdr2set +33 -0
  42. data/ext/apriori/src/apriori.c +750 -0
  43. data/ext/apriori/src/apriori.dsp +120 -0
  44. data/ext/apriori/src/apriori.dsw +29 -0
  45. data/ext/apriori/src/apriori.mak +99 -0
  46. data/ext/apriori/src/istree.c +1411 -0
  47. data/ext/apriori/src/istree.h +160 -0
  48. data/ext/apriori/src/makefile +105 -0
  49. data/ext/apriori/src/tract.c +870 -0
  50. data/ext/apriori/src/tract.h +261 -0
  51. data/ext/apriori_wrapper.c +757 -0
  52. data/ext/apriori_wrapper.h +10 -0
  53. data/ext/extconf.rb +32 -0
  54. data/ext/math/doc/copying +504 -0
  55. data/ext/math/src/chi2.c +151 -0
  56. data/ext/math/src/chi2.h +27 -0
  57. data/ext/math/src/choose.c +71 -0
  58. data/ext/math/src/choose.h +16 -0
  59. data/ext/math/src/gamma.c +446 -0
  60. data/ext/math/src/gamma.h +39 -0
  61. data/ext/math/src/intexp.c +35 -0
  62. data/ext/math/src/intexp.h +15 -0
  63. data/ext/math/src/makefile +164 -0
  64. data/ext/math/src/math.mak +48 -0
  65. data/ext/math/src/normal.c +387 -0
  66. data/ext/math/src/normal.h +44 -0
  67. data/ext/math/src/radfn.c +113 -0
  68. data/ext/math/src/radfn.h +34 -0
  69. data/ext/math/src/zeta.c +49 -0
  70. data/ext/math/src/zeta.h +15 -0
  71. data/ext/pre-clean.rb +8 -0
  72. data/ext/pre-setup.rb +9 -0
  73. data/ext/util/doc/copying +504 -0
  74. data/ext/util/src/listops.c +76 -0
  75. data/ext/util/src/listops.h +26 -0
  76. data/ext/util/src/makefile +103 -0
  77. data/ext/util/src/memsys.c +84 -0
  78. data/ext/util/src/memsys.h +42 -0
  79. data/ext/util/src/nstats.c +288 -0
  80. data/ext/util/src/nstats.h +69 -0
  81. data/ext/util/src/params.c +86 -0
  82. data/ext/util/src/params.h +19 -0
  83. data/ext/util/src/parse.c +133 -0
  84. data/ext/util/src/parse.h +81 -0
  85. data/ext/util/src/scan.c +767 -0
  86. data/ext/util/src/scan.h +111 -0
  87. data/ext/util/src/symtab.c +443 -0
  88. data/ext/util/src/symtab.h +121 -0
  89. data/ext/util/src/tabscan.c +279 -0
  90. data/ext/util/src/tabscan.h +99 -0
  91. data/ext/util/src/util.mak +91 -0
  92. data/ext/util/src/vecops.c +317 -0
  93. data/ext/util/src/vecops.h +42 -0
  94. data/lib/apriori.rb +133 -0
  95. data/lib/apriori/adapter.rb +13 -0
  96. data/lib/apriori/association_rule.rb +97 -0
  97. data/lib/apriori/version.rb +3 -0
  98. data/script/console +10 -0
  99. data/script/destroy +14 -0
  100. data/script/generate +14 -0
  101. data/script/txt2html +82 -0
  102. data/setup.rb +1585 -0
  103. data/tasks/apriori.rake +20 -0
  104. data/tasks/attic.rake +28 -0
  105. data/tasks/deployment.rake +34 -0
  106. data/tasks/environment.rake +7 -0
  107. data/tasks/install.rake +13 -0
  108. data/tasks/website.rake +17 -0
  109. data/test/apriori_test.rb +13 -0
  110. data/test/fixtures/market_basket_results_test.txt +5 -0
  111. data/test/fixtures/market_basket_string_test.txt +7 -0
  112. data/test/fixtures/results.txt +2 -0
  113. data/test/fixtures/sample.txt +7 -0
  114. data/test/test_helper.rb +5 -0
  115. data/test/unit/test_apriori.rb +68 -0
  116. data/test/unit/test_itemsets_and_parsing.rb +82 -0
  117. data/website/index.html +251 -0
  118. data/website/index.txt +154 -0
  119. data/website/javascripts/rounded_corners_lite.inc.js +285 -0
  120. data/website/stylesheets/screen.css +142 -0
  121. data/website/template.html.erb +49 -0
  122. metadata +267 -0
@@ -0,0 +1,120 @@
1
+ # Microsoft Developer Studio Project File - Name="apriori" - Package Owner=<4>
2
+ # Microsoft Developer Studio Generated Build File, Format Version 6.00
3
+ # ** NICHT BEARBEITEN **
4
+
5
+ # TARGTYPE "Win32 (x86) Console Application" 0x0103
6
+
7
+ CFG=apriori - Win32 Debug
8
+ !MESSAGE Dies ist kein gültiges Makefile. Zum Erstellen dieses Projekts mit NMAKE
9
+ !MESSAGE verwenden Sie den Befehl "Makefile exportieren" und führen Sie den Befehl
10
+ !MESSAGE
11
+ !MESSAGE NMAKE /f "apriori.mak".
12
+ !MESSAGE
13
+ !MESSAGE Sie können beim Ausführen von NMAKE eine Konfiguration angeben
14
+ !MESSAGE durch Definieren des Makros CFG in der Befehlszeile. Zum Beispiel:
15
+ !MESSAGE
16
+ !MESSAGE NMAKE /f "apriori.mak" CFG="apriori - Win32 Debug"
17
+ !MESSAGE
18
+ !MESSAGE Für die Konfiguration stehen zur Auswahl:
19
+ !MESSAGE
20
+ !MESSAGE "apriori - Win32 Release" (basierend auf "Win32 (x86) Console Application")
21
+ !MESSAGE "apriori - Win32 Debug" (basierend auf "Win32 (x86) Console Application")
22
+ !MESSAGE
23
+
24
+ # Begin Project
25
+ # PROP AllowPerConfigDependencies 0
26
+ # PROP Scc_ProjName ""
27
+ # PROP Scc_LocalPath ""
28
+ CPP=cl.exe
29
+ RSC=rc.exe
30
+
31
+ !IF "$(CFG)" == "apriori - Win32 Release"
32
+
33
+ # PROP BASE Use_MFC 0
34
+ # PROP BASE Use_Debug_Libraries 0
35
+ # PROP BASE Output_Dir "Release"
36
+ # PROP BASE Intermediate_Dir "Release"
37
+ # PROP BASE Target_Dir ""
38
+ # PROP Use_MFC 0
39
+ # PROP Use_Debug_Libraries 0
40
+ # PROP Output_Dir "Release"
41
+ # PROP Intermediate_Dir "Release"
42
+ # PROP Target_Dir ""
43
+ # ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
44
+ # ADD CPP /nologo /W3 /GX /O2 /I "..\..\util\src" /I "..\..\math\src" /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /D "NIMAPFN" /YX /FD /c
45
+ # ADD BASE RSC /l 0x407 /d "NDEBUG"
46
+ # ADD RSC /l 0x407 /d "NDEBUG"
47
+ BSC32=bscmake.exe
48
+ # ADD BASE BSC32 /nologo
49
+ # ADD BSC32 /nologo
50
+ LINK32=link.exe
51
+ # ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
52
+ # ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
53
+
54
+ !ELSEIF "$(CFG)" == "apriori - Win32 Debug"
55
+
56
+ # PROP BASE Use_MFC 0
57
+ # PROP BASE Use_Debug_Libraries 1
58
+ # PROP BASE Output_Dir "Debug"
59
+ # PROP BASE Intermediate_Dir "Debug"
60
+ # PROP BASE Target_Dir ""
61
+ # PROP Use_MFC 0
62
+ # PROP Use_Debug_Libraries 1
63
+ # PROP Output_Dir "Debug"
64
+ # PROP Intermediate_Dir "Debug"
65
+ # PROP Target_Dir ""
66
+ # ADD BASE CPP /nologo /W3 /Gm /GX /Zi /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
67
+ # ADD CPP /nologo /W3 /Gm /GX /ZI /Od /I "..\..\util\src" /I "..\..\math\src" /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /D "NIMAPFN" /YX /FD /c
68
+ # ADD BASE RSC /l 0x407 /d "_DEBUG"
69
+ # ADD RSC /l 0x407 /d "_DEBUG"
70
+ BSC32=bscmake.exe
71
+ # ADD BASE BSC32 /nologo
72
+ # ADD BSC32 /nologo
73
+ LINK32=link.exe
74
+ # ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
75
+ # ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
76
+
77
+ !ENDIF
78
+
79
+ # Begin Target
80
+
81
+ # Name "apriori - Win32 Release"
82
+ # Name "apriori - Win32 Debug"
83
+ # Begin Source File
84
+
85
+ SOURCE=.\apriori.c
86
+ # End Source File
87
+ # Begin Source File
88
+
89
+ SOURCE=.\tract.c
90
+ # End Source File
91
+ # Begin Source File
92
+
93
+ SOURCE=.\istree.c
94
+ # End Source File
95
+ # Begin Source File
96
+
97
+ SOURCE=..\..\util\src\scan.c
98
+ # End Source File
99
+ # Begin Source File
100
+
101
+ SOURCE=..\..\util\src\symtab.c
102
+ # End Source File
103
+ # Begin Source File
104
+
105
+ SOURCE=..\..\util\src\tabscan.c
106
+ # End Source File
107
+ # Begin Source File
108
+
109
+ SOURCE=..\..\util\src\vecops.c
110
+ # End Source File
111
+ # Begin Source File
112
+
113
+ SOURCE=..\..\math\src\gamma.c
114
+ # End Source File
115
+ # Begin Source File
116
+
117
+ SOURCE=..\..\math\src\chi2.c
118
+ # End Source File
119
+ # End Target
120
+ # End Project
@@ -0,0 +1,29 @@
1
+ Microsoft Developer Studio Workspace File, Format Version 6.00
2
+ # WARNUNG: DIESE ARBEITSBEREICHSDATEI DARF NICHT BEARBEITET ODER GELÖSCHT WERDEN!
3
+
4
+ ###############################################################################
5
+
6
+ Project: "apriori"=.\apriori.dsp - Package Owner=<4>
7
+
8
+ Package=<5>
9
+ {{{
10
+ }}}
11
+
12
+ Package=<4>
13
+ {{{
14
+ }}}
15
+
16
+ ###############################################################################
17
+
18
+ Global:
19
+
20
+ Package=<5>
21
+ {{{
22
+ }}}
23
+
24
+ Package=<3>
25
+ {{{
26
+ }}}
27
+
28
+ ###############################################################################
29
+
@@ -0,0 +1,99 @@
1
+ #-----------------------------------------------------------------------
2
+ # File : apriori.mak
3
+ # Contents: build apriori program
4
+ # Author : Christian Borgelt
5
+ # History : 26.01.2003 file created
6
+ # 20.07.2006 adapted to Visual Studio 8
7
+ #-----------------------------------------------------------------------
8
+ CC = cl.exe
9
+ LD = link.exe
10
+ DEFS = /D WIN32 /D NDEBUG /D _CONSOLE /D _MBCS \
11
+ /D _CRT_SECURE_NO_DEPRECATE
12
+ CFLAGS = /nologo /W3 /EHsc /O2 /I $(UTILDIR) /I $(MATHDIR) $(DEFS) /FD /c
13
+ LDFLAGS = /nologo /subsystem:console /incremental:no /machine:X86
14
+
15
+ THISDIR = ..\..\apriori\src
16
+ UTILDIR = ..\..\util\src
17
+ MATHDIR = ..\..\math\src
18
+ HDRS = $(UTILDIR)\vecops.h $(UTILDIR)\symtab.h \
19
+ $(UTILDIR)\tabscan.h $(UTILDIR)\scan.h \
20
+ $(MATHDIR)\gamma.h $(MATHDIR)\chi2.h \
21
+ tract.h istree.h
22
+ OBJS = $(UTILDIR)\vecops.obj $(UTILDIR)\nimap.obj \
23
+ $(UTILDIR)\tabscan.obj $(UTILDIR)\scan.obj \
24
+ $(MATHDIR)\gamma.obj $(MATHDIR)\chi2.obj \
25
+ tract.obj istree.obj apriori.obj
26
+
27
+ #-----------------------------------------------------------------------
28
+ # Build Program
29
+ #-----------------------------------------------------------------------
30
+ all: apriori.exe
31
+
32
+ apriori.exe: $(OBJS)
33
+ $(LD) $(LDFLAGS) $(OBJS) $(LIBS) /out:$@
34
+
35
+ #-----------------------------------------------------------------------
36
+ # Item and Transaction Management
37
+ #-----------------------------------------------------------------------
38
+ tract.obj: $(UTILDIR)\symtab.h tract.h tract.c apriori.mak
39
+ $(CC) $(CFLAGS) tract.c /Fo$@
40
+
41
+ #-----------------------------------------------------------------------
42
+ # Frequent Item Set Tree Management
43
+ #-----------------------------------------------------------------------
44
+ istree.obj: $(MATHDIR)\gamma.h tract.h istree.h istree.c apriori.mak
45
+ $(CC) $(CFLAGS) istree.c /Fo$@
46
+
47
+ #-----------------------------------------------------------------------
48
+ # Main Program
49
+ #-----------------------------------------------------------------------
50
+ apriori.obj: $(UTILDIR)\symtab.h tract.h istree.h apriori.c apriori.mak
51
+ $(CC) $(CFLAGS) /D NIMAPFN apriori.c /Fo$@
52
+
53
+ #-----------------------------------------------------------------------
54
+ # External Modules
55
+ #-----------------------------------------------------------------------
56
+ $(UTILDIR)\vecops.obj:
57
+ cd $(UTILDIR)
58
+ $(MAKE) /f util.mak vecops.obj
59
+ cd $(THISDIR)
60
+ $(UTILDIR)\nimap.obj:
61
+ cd $(UTILDIR)
62
+ $(MAKE) /f util.mak nimap.obj
63
+ cd $(THISDIR)
64
+ $(UTILDIR)\tabscan.obj:
65
+ cd $(UTILDIR)
66
+ $(MAKE) /f util.mak tabscan.obj
67
+ cd $(THISDIR)
68
+ $(UTILDIR)\scan.obj:
69
+ cd $(UTILDIR)
70
+ $(MAKE) /f util.mak scan.obj
71
+ cd $(THISDIR)
72
+ $(MATHDIR)\gamma.obj:
73
+ cd $(MATHDIR)
74
+ $(MAKE) /f math.mak gamma.obj
75
+ cd $(THISDIR)
76
+ $(MATHDIR)\chi2.obj:
77
+ cd $(MATHDIR)
78
+ $(MAKE) /f math.mak chi2.obj
79
+ cd $(THISDIR)
80
+
81
+ #-----------------------------------------------------------------------
82
+ # Install
83
+ #-----------------------------------------------------------------------
84
+ install:
85
+ -@copy apriori.exe c:\home\bin
86
+
87
+ #-----------------------------------------------------------------------
88
+ # Clean up
89
+ #-----------------------------------------------------------------------
90
+ clean:
91
+ $(MAKE) /f apriori.mak localclean
92
+ cd $(UTILDIR)
93
+ $(MAKE) /f util.mak clean
94
+ cd $(MATHDIR)
95
+ $(MAKE) /f math.mak clean
96
+ cd $(THISDIR)
97
+
98
+ localclean:
99
+ -@erase /Q *~ *.obj *.idb *.pch apriori.exe
@@ -0,0 +1,1411 @@
1
+ /*----------------------------------------------------------------------
2
+ File : istree.c
3
+ Contents: item set tree management
4
+ Author : Christian Borgelt
5
+ History : 1996.01.22 file created
6
+ 1996.02.07 _child, _count, ist_addlvl, and ist_count
7
+ 1996.02.09 ist_rule programmed and debugged
8
+ 1996.02.10 empty rule bodies made optional
9
+ 1996.03.28 support made relative to number of item sets
10
+ 1996.06.25 function _count optimized
11
+ 1996.11.23 rule extraction redesigned
12
+ 1996.11.24 rule selection criteria added
13
+ 1997.08.18 normalized chi^2 measure added
14
+ parameter minlen added to function ist_init()
15
+ 1998.01.15 confidence comparison changed to >=
16
+ 1998.01.23 integer support computation changed (ceil)
17
+ 1998.01.26 condition added to set extension in _child
18
+ 1998.02.10 bug in computation of EM_INFO fixed
19
+ 1998.02.11 parameter 'minval' added to function ist_init()
20
+ 1998.05.14 item set tree navigation functions added
21
+ 1998.08.08 item appearances considered for rule selection
22
+ 1998.08.20 deferred child node vector allocation added
23
+ 1998.09.02 several assertions added
24
+ 1998.09.05 bug concerning node id fixed
25
+ 1998.09.07 function ist_hedge added
26
+ 1998.09.22 bug in rule extraction (item appearances) fixed
27
+ 1998.09.23 computation of chi^2 measure simplified
28
+ 1999.02.05 long int changed to int
29
+ 1999.08.25 rule extraction simplified
30
+ 1999.11.05 rule evaluation measure EM_AIMP added
31
+ 1999.11.08 parameter 'aval' added to function ist_rule
32
+ 1999.11.11 rule consequents moved to first field
33
+ 1999.12.01 bug in node reallocation fixed
34
+ 2001.04.01 functions ist_set and ist_getcntx added,
35
+ functions _count and _getsupp improved
36
+ 2001.12.28 sort function moved to module tract
37
+ 2002.02.07 tree clearing removed, counting improved
38
+ 2002.02.08 child creation improved (check of body support)
39
+ 2002.02.10 IST_IGNORE bugs fixed (ist_set and ist_hedge)
40
+ 2002.02.11 memory usage minimization option added
41
+ 2002.02.12 ist_first and ist_last replaced by ist_next
42
+ 2002.02.19 transaction tree functions added
43
+ 2002.10.09 bug in function ist_hedge fixed (conf. comp.)
44
+ 2003.03.12 parameter lift added to function ist_rule
45
+ 2003.07.17 check of item usage added (function ist_check)
46
+ 2003.07.18 maximally frequent item set filter added
47
+ 2003.08.11 item set filtering generalized (ist_filter)
48
+ 2003.08.15 renamed new to cur in ist_addlvl (C++ compat.)
49
+ 2003.11.14 definition of F_HDONLY changed to INT_MIN
50
+ 2003.12.02 skipping unnecessary subtrees added (_checksub)
51
+ 2003.12.03 bug in ist_check for rule mining fixed
52
+ 2003.12.12 padding for 64 bit architecture added
53
+ 2004.05.09 additional selection measure for sets added
54
+ 2004.12.09 bug in add. evaluation measure for sets fixed
55
+ 2006.11.26 support parameter changed to an absolute value
56
+ 2007.02.07 bug in function ist_addlvl / _child fixed
57
+ 2008.01.25 bug in filtering closed/maximal item sets fixed
58
+ 2008.03.13 additional rule evaluation redesigned
59
+ 2008.03.24 creation based on ITEMSET structure
60
+ ----------------------------------------------------------------------*/
61
+ #include <stdio.h>
62
+ #include <stdlib.h>
63
+ #include <string.h>
64
+ #include <limits.h>
65
+ #include <float.h>
66
+ #include <math.h>
67
+ #include <assert.h>
68
+ #include "istree.h"
69
+ #include "chi2.h"
70
+ #ifdef STORAGE
71
+ #include "storage.h"
72
+ #endif
73
+
74
+ /*----------------------------------------------------------------------
75
+ Preprocessor Definitions
76
+ ----------------------------------------------------------------------*/
77
+ #define LN_2 0.69314718055994530942 /* ln(2) */
78
+ #define EPSILON 1e-12 /* to cope with roundoff errors */
79
+ #define BLKSIZE 32 /* block size for level vector */
80
+ #define F_HDONLY INT_MIN /* flag for head only item in path (combined with a node's id, see ID/HDONLY) */
81
+ #define F_SKIP INT_MIN /* flag for subtree skipping (or-ed into node->chcnt and into counters) */
82
+ #define ID(n) ((int)((n)->id & ~F_HDONLY)) /* item id of node n with the head only flag masked out */
83
+ #define HDONLY(n) ((int)((n)->id & F_HDONLY)) /* nonzero iff the head only flag is set in the id of node n */
84
+ #define COUNT(n) ((n) & ~F_SKIP) /* counter value n with the skip flag masked out */
85
+
86
+ /*----------------------------------------------------------------------
87
+ Type Definitions
88
+ ----------------------------------------------------------------------*/
89
+ typedef double EVALFN (int set, int body, int head, int n);
90
+ /* type of a function to compute an additional evaluation measure from four integers; presumably the supports of item set, rule body and rule head and the number of transactions -- TODO confirm against the implementations */
91
+
92
+ /*----------------------------------------------------------------------
93
+ Auxiliary Functions
94
+ ----------------------------------------------------------------------*/
95
+
96
+ static int _bsearch (int *vec, int n, int id)
97
+ { /* --- binary search for the item id in vec[0..n-1] (the comparisons assume ascending order); returns the index of id or -1 if it is absent */
98
+ int i, k; /* left and middle index */
99
+
100
+ assert(vec && (n > 0)); /* check the function arguments */
101
+ for (i = 0; i < n; ) { /* while the range [i,n) is not empty */
102
+ k = (i + n) >> 1; /* get index of middle element */
103
+ if (vec[k] > id) n = k;
104
+ else if (vec[k] < id) i = k+1;
105
+ else return k; /* adapt range boundaries or return */
106
+ } /* the index the id. was found at */
107
+ return -1; /* return 'not found' */
108
+ } /* _bsearch() */
109
+
110
+ /*--------------------------------------------------------------------*/
111
+
112
+ static void _count (ISNODE *node, int *set, int cnt, int min)
113
+ { /* --- count the transaction set[0..cnt-1] recursively in the subtree rooted at node; counters are incremented only in new nodes (chcnt == 0); min is decremented once per level and ends the item loops when fewer than min items remain; nodes with a negative chcnt (skip flag set) match neither branch and are left untouched */
114
+ int i; /* vector index */
115
+ int *map, n; /* identifier map, its size (or the item offset) */
116
+ ISNODE **vec; /* child node vector */
117
+
118
+ assert(node /* check the function arguments */
119
+ && (cnt >= 0) && (set || (cnt <= 0)));
120
+ if (node->offset >= 0) { /* if a pure vector is used (counter i belongs to item offset+i) */
121
+ if (node->chcnt == 0) { /* if this is a new node */
122
+ n = node->offset; /* get the index offset */
123
+ while ((cnt > 0) && (*set < n)) {
124
+ cnt--; set++; } /* skip items before first counter */
125
+ while (--cnt >= 0) { /* traverse the transaction's items */
126
+ i = *set++ -n; /* compute counter vector index */
127
+ if (i >= node->size) return; /* beyond the last counter: abort */
128
+ node->cnts[i]++; /* if the counter exists, */
129
+ } } /* count the transaction */
130
+ else if (node->chcnt > 0) { /* if there are child nodes */
131
+ vec = (ISNODE**)(node->cnts +node->size); /* child vector starts right behind the counters */
132
+ n = ID(vec[0]); /* item id of the first child serves as index offset; NOTE(review): vec[0] is dereferenced unchecked, so the first child is presumably never NULL -- confirm */
133
+ min--; /* one item less to the deepest nodes */
134
+ while ((cnt > min) && (*set < n)) {
135
+ cnt--; set++; } /* skip items before first child */
136
+ while (--cnt >= min) { /* traverse the transaction's items */
137
+ i = *set++ -n; /* compute child vector index */
138
+ if (i >= node->chcnt) return; /* beyond the last child: abort */
139
+ if (vec[i]) _count(vec[i], set, cnt, min);
140
+ } /* if the child exists, */
141
+ } } /* count the transaction recursively */
142
+ else { /* if an identifier map is used */
143
+ map = node->cnts +(n = node->size); /* id. map starts right behind the counters */
144
+ if (node->chcnt == 0) { /* if this is a new node */
145
+ while (--cnt >= 0) { /* traverse the transaction's items */
146
+ if (*set > map[n-1]) return; /* if beyond last item, abort */
147
+ i = _bsearch(map, n, *set++);
148
+ if (i >= 0) node->cnts[i]++;
149
+ } } /* find index and count transaction */
150
+ else if (node->chcnt > 0) { /* if there are child nodes */
151
+ vec = (ISNODE**)(map +n); /* get id. map and child vector */
152
+ if (node->chcnt < n) /* if a secondary id. map exists */
153
+ map = (int*)(vec +(n = node->chcnt)); /* it starts right behind the child vector */
154
+ min--; /* one item less to the deepest nodes */
155
+ while (--cnt >= min) { /* traverse the transaction's items */
156
+ if (*set > map[n-1]) return; /* if beyond last item, abort */
157
+ i = _bsearch(map, n, *set++);
158
+ if ((i >= 0) && vec[i]) _count(vec[i], set, cnt, min);
159
+ } /* search for the proper index */
160
+ } /* and if the child exists, */
161
+ } /* count the transaction recursively */
162
+ } /* _count() */
163
+
164
+ /*--------------------------------------------------------------------*/
165
+
166
+ static void _countx (ISNODE *node, TATREE *tat, int min)
167
+ { /* --- count a transaction tree recursively: walks the item set tree (node) and the transaction tree (tat) together; min is decremented once per item set tree level */
168
+ int i, k; /* vector index, loop variable */
169
+ int *map, n; /* identifier map, its size (or the item offset) */
170
+ ISNODE **vec; /* child node vector */
171
+
172
+ assert(node && tat); /* check the function arguments */
173
+ if (tat_max(tat) < min) /* if the transactions are too short, */
174
+ return; /* abort the recursion */
175
+ k = tat_size(tat); /* get the number of children (a negative value is handled as a normal transaction with -k items) */
176
+ if (k <= 0) { /* if there are no children */
177
+ if (k < 0) _count(node, tat_items(tat), -k, min);
178
+ return; /* count the normal transaction */
179
+ } /* and abort the function */
180
+ while (--k >= 0) /* count the transactions recursively: */
181
+ _countx(node, tat_child(tat, k), min); /* every child subtree is first counted against this same node */
182
+ if (node->offset >= 0) { /* if a pure vector is used */
183
+ if (node->chcnt == 0) { /* if this is a new node */
184
+ n = node->offset; /* get the index offset */
185
+ for (k = tat_size(tat); --k >= 0; ) {
186
+ i = tat_item(tat,k) -n; /* traverse the items (last to first) */
187
+ if (i < 0) return; /* if before first item, abort */
188
+ if (i < node->size) /* if inside the counter range */
189
+ node->cnts[i] += tat_cnt(tat_child(tat, k));
190
+ } } /* count the transaction */
191
+ else if (node->chcnt > 0) { /* if there are child nodes */
192
+ vec = (ISNODE**)(node->cnts +node->size); /* child vector starts right behind the counters */
193
+ n = ID(vec[0]); /* item id of the first child serves as index offset */
194
+ min--; /* one item less to the deepest nodes */
195
+ for (k = tat_size(tat); --k >= 0; ) {
196
+ i = tat_item(tat,k) -n; /* traverse the items (last to first) */
197
+ if (i < 0) return; /* if before first item, abort */
198
+ if ((i < node->chcnt) && vec[i])
199
+ _countx(vec[i], tat_child(tat, k), min);
200
+ } /* if the child exists, */
201
+ } } /* count the transaction recursively */
202
+ else { /* if an identifier map is used */
203
+ map = node->cnts +(n = node->size); /* id. map starts right behind the counters */
204
+ if (node->chcnt == 0) { /* if this is a new node */
205
+ for (k = tat_size(tat); --k >= 0; ) {
206
+ i = tat_item(tat, k); /* get the next item */
207
+ if (i < map[0]) return; /* if before first item, abort */
208
+ i = _bsearch(map, n, i);
209
+ if (i >= 0) node->cnts[i] += tat_cnt(tat_child(tat, k));
210
+ } } /* find index and count transaction */
211
+ else if (node->chcnt > 0) { /* if there are child nodes */
212
+ vec = (ISNODE**)(map +n); /* get id. map and child vector */
213
+ if (node->chcnt < n) /* if a secondary id. map exists */
214
+ map = (int*)(vec +(n = node->chcnt)); /* it starts right behind the child vector */
215
+ min--; /* one item less to the deepest nodes */
216
+ for (k = tat_size(tat); --k >= 0; ) {
217
+ i = tat_item(tat, k); /* get the next item */
218
+ if (i < map[0]) return; /* if before first item, abort */
219
+ i = _bsearch(map, n, i);
220
+ if ((i >= 0) && vec[i]) _countx(vec[i], tat_child(tat, k), min);
221
+ } /* search for the proper index */
222
+ } /* and if the child exists, */
223
+ } /* count the transaction recursively */
224
+ } /* _countx() */
225
+
226
+ /*--------------------------------------------------------------------*/
227
+
228
+ static int _checksub (ISNODE *node)
229
+ { /* --- recursively check subtrees; returns 0 if the subtree contains a new leaf (chcnt == 0) and -1 if it can be skipped, in which case the skip flag is set in node->chcnt */
230
+ int i, r; /* vector index, result */
231
+ ISNODE **vec; /* child node vector */
232
+
233
+ assert(node); /* check the function argument */
234
+ if (node->chcnt == 0) return 0; /* do not skip new leaves */
235
+ if (node->chcnt < 0) return -1; /* skip marked subtrees */
236
+ if (node->offset >= 0) /* if a pure vector is used */
237
+ vec = (ISNODE**)(node->cnts +node->size); /* children follow the counters */
238
+ else /* if an identifier map is used */
239
+ vec = (ISNODE**)(node->cnts +node->size +node->size); /* children follow the counters and the id. map */
240
+ for (r = -1, i = node->chcnt; --i >= 0; ) /* r stays -1 only if every existing child returns -1 */
241
+ if (vec[i]) r &= _checksub(vec[i]);
242
+ if (!r) return 0; /* recursively check all children */
243
+ node->chcnt |= F_SKIP; /* set the skip flag if possible */
244
+ return -1; /* return 'subtree can be skipped' */
245
+ } /* _checksub() */
246
+
247
+ /*--------------------------------------------------------------------*/
248
+
249
+ static int _checkuse (ISNODE *node, char *marks, int supp)
250
+ { /* --- recursively check item usage: sets marks[item] = 1 for every item whose counter in a new node (chcnt == 0) is at least supp and for the items of the nodes on the path to such a counter; returns 1 if anything was marked in this subtree, otherwise 0 */
251
+ int i, r = 0; /* vector index, result of check */
252
+ int *map, n; /* identifier map, item offset */
253
+ ISNODE **vec; /* child node vector */
254
+
255
+ assert(node && marks); /* check the function arguments */
256
+ if (node->offset >= 0) { /* if a pure vector is used */
257
+ if (node->chcnt == 0) { /* if this is a new node */
258
+ n = node->offset; /* get the index offset */
259
+ for (i = node->size; --i >= 0; ) {
260
+ if (node->cnts[i] >= supp)
261
+ marks[n+i] = r = 1; /* mark items in set that satisfies */
262
+ } } /* the minimum support criterion */
263
+ else if (node->chcnt > 0) { /* if there are child nodes */
264
+ vec = (ISNODE**)(node->cnts +node->size); /* children follow the counters */
265
+ for (i = node->chcnt; --i >= 0; )
266
+ if (vec[i]) r |= _checkuse(vec[i], marks, supp);
267
+ } } /* recursively process all children */
268
+ else { /* if an identifier map is used */
269
+ map = node->cnts +node->size; /* id. map follows the counters */
270
+ if (node->chcnt == 0) { /* if this is a new node */
271
+ for (i = node->size; --i >= 0; ) {
272
+ if (node->cnts[i] >= supp)
273
+ marks[map[i]] = r = 1;/* mark items in set that satisfies */
274
+ } } /* the minimum support criterion */
275
+ else if (node->chcnt > 0) { /* if there are child nodes */
276
+ vec = (ISNODE**)(map +node->size);
277
+ for (i = node->chcnt; --i >= 0; )
278
+ if (vec[i]) r |= _checkuse(vec[i], marks, supp);
279
+ } /* get the child vector and */
280
+ } /* recursively process all children */
281
+ if ((r != 0) && node->parent) /* if the check succeeded, mark */
282
+ marks[ID(node)] = 1; /* the item associated with the node (not done for a node without parent) */
283
+ return r; /* return the check result */
284
+ } /* _checkuse() */
285
+
286
+ /*--------------------------------------------------------------------*/
287
+
288
+ static int _getsupp (ISNODE *node, int *set, int cnt)
289
+ { /* --- get support of an item set: follows set[0..cnt-2] down from node and returns the counter stored for the last item, or -1 if the path or the counter does not exist; the counter is returned as stored, i.e. without masking out the skip flag */
290
+ int i, n, c; /* vector index, buffers */
291
+ int *map; /* identifier map */
292
+ ISNODE **vec; /* vector of child nodes */
293
+
294
+ assert(node && set && (cnt >= 0)); /* check the function arguments */
295
+ while (--cnt > 0) { /* follow the set/path from the node */
296
+ c = node->chcnt & ~F_SKIP; /* if there are no children, */
297
+ if (c <= 0) return -1; /* the support is less than minsupp */
298
+ if (node->offset >= 0) { /* if a pure vector is used */
299
+ vec = (ISNODE**)(node->cnts +node->size);
300
+ i = *set++ -ID(vec[0]); /* compute the child vector index and */
301
+ if (i >= c) return -1; } /* abort if the child does not exist */
302
+ else { /* if an identifier map is used */
303
+ map = node->cnts +(n = node->size);
304
+ vec = (ISNODE**)(map +n); /* get id. map and child vector */
305
+ if (c < n) /* if a secondary id. map exists, */
306
+ map = (int*)(vec +(n = c)); /* get this identifier map */
307
+ i = _bsearch(map, n, *set++);
308
+ } /* search for the proper index */
309
+ if (i < 0) return -1; /* abort if index is out of range */
310
+ node = vec[i]; /* go to the corresponding child */
311
+ if (!node) return -1; /* if the child does not exist, */
312
+ } /* the support is less than minsupp */
313
+ if (node->offset >= 0) { /* if a pure vector is used, */
314
+ i = *set -node->offset; /* compute the counter index */
315
+ if (i >= node->size) return -1; } /* abort if beyond the last counter */
316
+ else { /* if an identifier map is used */
317
+ map = node->cnts +(n = node->size);
318
+ i = _bsearch(map, n, *set);
319
+ } /* search for the proper index */
320
+ if (i < 0) return -1; /* abort if index is out of range */
321
+ return node->cnts[i]; /* return the item set support */
322
+ } /* _getsupp() */
323
+
324
+ /*--------------------------------------------------------------------*/
325
+
326
+ static void _marksupp (ISNODE *node, int *set, int cnt, int supp)
327
+ { /* --- mark support of an item set: follows set[0..cnt-2] down from node and sets the skip flag in the counter of the last item if supp < 0 or the counter equals supp; NOTE(review): unlike _getsupp() no index range or NULL child checks are made, so the item set is presumably required to exist in the tree -- confirm against the callers */
328
+ int i, n, c; /* vector index, buffers */
329
+ int *map; /* identifier map */
330
+ ISNODE **vec; /* vector of child nodes */
331
+
332
+ assert(node && set && (cnt >= 0)); /* check the function arguments */
333
+ while (--cnt > 0) { /* follow the set/path from the node */
334
+ if (node->offset >= 0) { /* if a pure vector is used */
335
+ vec = (ISNODE**)(node->cnts +node->size);
336
+ i = *set++ -ID(vec[0]);}/* compute the child vector index */
337
+ else { /* if an identifier map is used */
338
+ map = node->cnts +(n = node->size);
339
+ vec = (ISNODE**)(map +n); /* get id. map, child vector and */
340
+ c = node->chcnt & ~F_SKIP; /* the number of children */
341
+ if (c < n) /* if a secondary id. map exists, */
342
+ map = (int*)(vec +(n = c)); /* get this identifier map */
343
+ i = _bsearch(map, n, *set++);
344
+ } /* search for the proper index */
345
+ node = vec[i]; /* go to the corresponding child */
346
+ }
347
+ if (node->offset >= 0) /* if a pure vector is used, */
348
+ i = *set -node->offset; /* compute the counter index */
349
+ else { /* if an identifier map is used */
350
+ map = node->cnts +(n = node->size);
351
+ i = _bsearch(map, n, *set);
352
+ } /* search for the proper index */
353
+ if ((supp < 0) /* if to clear unconditionally */
354
+ || (node->cnts[i] == supp)) /* or the support is the same */
355
+ node->cnts[i] |= F_SKIP; /* mark support as cleared */
356
+ } /* _marksupp() */
357
+
358
+ /*--------------------------------------------------------------------*/
359
+
360
+ static void _marksub (ISTREE *ist, ISNODE *node, int index, int supp)
361
+ { /* --- mark all n-1 subsets of the item set given by counter index in node; the subsets are assembled backwards from the end of ist->buf and passed to _marksupp() together with supp; NOTE(review): node->parent is dereferenced unchecked, so node must not be the root */
362
+ int i; /* next item, loop variable */
363
+ int *set; /* (partial) item set */
364
+
365
+ if (node->offset >= 0) i = node->offset +index; /* pure vector: item = offset + index */
366
+ else i = node->cnts[node->size +index]; /* otherwise read the item from the id. map behind the counters */
367
+ set = ist->buf +ist->vsz; /* get and store the first two items */
368
+ *--set = i; _marksupp(node->parent, set, 1, supp);
369
+ *--set = ID(node); _marksupp(node->parent, set, 1, supp);
370
+ i = 2; /* mark counters in parent node */
371
+ for (node = node->parent; node->parent; node = node->parent) {
372
+ _marksupp(node->parent, set, i, supp);
373
+ *--set = ID(node); i++; /* climb up the tree and mark */
374
+ } /* counters for all n-1 subsets */
375
+ } /* _marksub() */
376
+
377
+ /*--------------------------------------------------------------------*/
378
+
379
+ static ISNODE* _child (ISTREE *ist, ISNODE *node, int index,
380
+ int s_min, int s_body)
381
+ { /* --- create child node (extend set)
+    * Tries to build the child of 'node' that extends the item set
+    * whose counter sits at position 'index' of the node.
+    *   ist     item set tree; ist->buf (tail) and ist->map are
+    *           used as scratch space by this function
+    *   node    node to extend, index must be in [0, node->size)
+    *   s_min   minimal support a set needs to be kept
+    *   s_body  minimal support a set needs to serve as rule body
+    * Returns NULL if no child is needed (item to ignore, two head
+    * only items, support below s_min, or no candidate survives),
+    * (void*)-1 if malloc fails, otherwise the new node with all
+    * counters cleared. The new node is not linked into the tree
+    * here: only its parent pointer is set, succ is NULL. */
382
+ int i, k, n; /* loop variables, counters */
383
+ ISNODE *curr; /* to traverse the path to the root */
384
+ int item, cnt; /* item identifier, number of items */
385
+ int *set; /* next (partial) item set to check */
386
+ int body; /* enough support for a rule body */
387
+ int hdonly; /* whether head only item on path */
388
+ int app; /* appearance flags of an item */
389
+ int s_set; /* support of an item set */
390
+
391
+ assert(ist && node /* check the function arguments */
392
+ && (index >= 0) && (index < node->size));
393
+ if (node->offset >= 0) item = node->offset +index;
394
+ else item = node->cnts[node->size +index];
395
+ app = is_getapp(ist->set, item); /* get item id. and app. flag */
396
+ if ((app == IST_IGNORE) /* do not extend an item to ignore */
397
+ || ((HDONLY(node) && (app == IST_HEAD))))
398
+ return NULL; /* nor a set with two head only items */
399
+ hdonly = HDONLY(node) || (app == IST_HEAD);
400
+
401
+ /* --- initialize --- */
402
+ s_set = node->cnts[index]; /* get support of item set to extend */
403
+ if (s_set < s_min) /* if set support is insufficient, */
404
+ return NULL; /* no child is needed, so abort */
405
+ body = (s_set >= s_body) /* if the set has enough support for */
406
+ ? 1 : 0; /* a rule body, set the body flag (bit 0) */
407
+ ist->buf[ist->vsz -2] = item; /* init. set for support checks */
408
+
409
+ /* --- check candidates --- */
410
+ for (n = 0, i = index; ++i < node->size; ) {
411
+ if (node->offset >= 0) k = node->offset +i;
412
+ else k = node->cnts[node->size +i];
413
+ app = is_getapp(ist->set, k); /* traverse the candidate items */
414
+ if ((app == IST_IGNORE) || (hdonly && (app == IST_HEAD)))
415
+ continue; /* skip sets with two head only items */
416
+ s_set = node->cnts[i]; /* get the support of the candidate set */
417
+ if (s_set < s_min) /* if set support is insufficient, */
418
+ continue; /* ignore the corresponding candidate */
419
+ body &= 1; /* restrict body flags to the set S (drop bits 1 and 2) */
420
+ if (s_set >= s_body) /* if set support is sufficient for */
421
+ body |= 2; /* a rule body, set the body flag (bit 1) */
422
+ set = ist->buf +ist->vsz -(cnt = 2);
423
+ set[1] = k; /* add the candidate item to the set */
424
+ for (curr = node; curr->parent; curr = curr->parent) {
425
+ s_set = _getsupp(curr->parent, set, cnt);
426
+ if (s_set < s_min) /* get the item set support and */
427
+ break; /* if it is too low, abort the loop */
428
+ if (s_set >= s_body) /* if some subset has enough support */
429
+ body |= 4; /* for a rule body, set the body flag (bit 2) */
430
+ *--set = ID(curr); cnt++; /* add id of current node to the set */
431
+ } /* and adapt the number of items */
432
+ if (!curr->parent && body) /* if the root was reached (all subsets ok) */
433
+ ist->map[n++] = k; /* and some body flag is set, */
434
+ } /* note the item identifier */
435
+ if (n <= 0) return NULL; /* if no child is needed, abort */
436
+ #ifdef BENCH /* if benchmark version: */
437
+ ist->scnec += n; /* sum the necessary counters */
438
+ #endif
439
+
440
+ /* --- decide on node structure --- */
441
+ k = ist->map[n-1] -ist->map[0] +1; /* width of the id. range covered */
442
+ if (!(ist->mode & IST_MEMOPT)) n = k;
443
+ else if (3*n >= 2*k) n = k; /* use a pure vector if it is small */
444
+ else k = n+n; /* enough, otherwise use an id. map */
445
+ #ifdef ARCH64 /* if 64 bit architecture */
446
+ if ((n == k) && (k & 1)) n = ++k;
447
+ #endif /* pad to even number of counters */
448
+ #ifdef BENCH /* if benchmark version */
449
+ ist->sccnt += n; /* sum the number of counters */
450
+ ist->bytes += sizeof(ISNODE) +(k-1) *sizeof(int) +8;
451
+ #endif /* determine the memory usage */
452
+
453
+ /* --- create child --- */
454
+ curr = (ISNODE*)malloc(sizeof(ISNODE) +(k-1) *sizeof(int));
455
+ if (!curr) return (void*)-1; /* create a child node; -1 signals out of memory */
456
+ curr->parent = node; /* set pointer to parent node */
457
+ curr->succ = NULL; /* and clear successor pointer */
458
+ curr->id = item; /* initialize the item id. and */
459
+ if (hdonly) curr->id |= F_HDONLY; /* set the head only flag */
460
+ curr->chcnt = 0; /* there are no children yet */
461
+ curr->size = n; /* set size of counter vector */
462
+ if (n == k) /* if to use a pure vector, */
463
+ curr->offset = ist->map[0]; /* note the first item as an offset */
464
+ else { /* if to use an identifier map, */
465
+ curr->offset = -1; /* use the offset as an indicator */
466
+ for (set = curr->cnts +n +(i = n); --i >= 0; )
467
+ *--set = ist->map[i]; /* copy the identifier map */
468
+ } /* from the buffer to the node */
469
+ for (set = curr->cnts +(i = n); --i >= 0; )
470
+ *--set = 0; /* clear all counters of the node */
471
+ return curr; /* return pointer to created child */
472
+ } /* _child() */
473
+
474
+ /*----------------------------------------------------------------------
475
+ In the above function the set S represented by the index-th vector
476
+ element of the current node is extended only by combining it with the
477
+ sets represented by the fields that follow it in the node vector,
478
+ i.e. by the sets represented by vec[index+1] to vec[size-1]. The sets
479
+ that can be formed by combining the set S and the sets represented by
480
+ vec[0] to vec[index-1] are processed in the branches for these sets.
481
+ In the 'check candidates' loop it is checked for each set represented
482
+ by vec[index+1] to vec[size-1] whether this set and all other subsets
483
+ of the same size, which can be formed from the union of this set and
484
+ the set S, have enough support, so that a child node is necessary.
485
+ Note that i +offset is the identifier of the item that has to be
486
+ added to set S to form the union of the set S and the set T represented
487
+ by vec[i], since S and T have the same path with the exception of the
488
+ index in the current node. Hence we can speak of candidate items that
489
+ are added to S.
490
+ Checking the support of the other subsets of the union of S and T
491
+ that have the same size as S and T is done with the aid of a path
492
+ variable. The items in this variable combined with the items on the
493
+ path to the current node always represent the subset currently tested.
494
+ That is, the path variable holds the path to be followed from the
495
+ current node to arrive at the support counter for the subset. The path
496
+ variable is initialized to [0]: <item>, [1]: <offset+i>, since the
497
+ support counters for S and T can be inspected directly. Then this
498
+ path is followed from the parent node of the current node, which is
499
+ equivalent to checking the subset that can be obtained by removing
500
+ from the union of S and T the item that corresponds to the parent node
501
+ (in the path to S or T, resp.).
502
+ Iteratively making the parent node the current node, adding its
503
+ corresponding item to the path and checking the support counter at the
504
+ end of the path variable when starting from its (the new current node's)
505
+ parent node tests all other subsets.
506
+ Another criterion is that the extended set must not contain two items
507
+ which may appear only in the head of a rule. If two such items are
508
+ contained in a set, neither can a rule be formed from its items nor can
509
+ it be the antecedent of a rule. Whether a set contains two head only
510
+ items is determined from the node's 'hdonly' flag and the appearance
511
+ flags of the items.
512
+ ----------------------------------------------------------------------*/
513
+
514
+ static void _cleanup (ISTREE *ist)
515
+ { /* --- clean up on error */
516
+ ISNODE *node, *t; /* to traverse the nodes */
517
+
518
+ assert(ist); /* check the function argument */
519
+ for (node = ist->lvls[ist->height]; node; ) {
520
+ t = node; node = node->succ; free(t); }
521
+ ist->lvls[ist->height] = NULL;/* delete all created nodes */
522
+ for (node = ist->lvls[ist->height -1]; node; node = node->succ)
523
+ node->chcnt = 0; /* clear the child node counters */
524
+ } /* _cleanup() */ /* of the deepest nodes in the tree */
525
+
526
+ /*----------------------------------------------------------------------
527
+ Additional Rule Evaluation Measure Functions
528
+ ----------------------------------------------------------------------*/
529
+
530
static double _none (int set, int body, int head, int n)
{                               /* --- no add. evaluation measure */
  (void)set;  (void)body;       /* all arguments are ignored; they */
  (void)head; (void)n;          /* only match the common signature */
  return 1.0;                   /* constant value for every rule */
}  /* _none() */
532
+
533
+ /*--------------------------------------------------------------------*/
534
+
535
static double _diff (int set, int body, int head, int n)
{                               /* --- absolute confidence difference */
  /* Returns |head/n - set/body|, i.e. the absolute difference       */
  /* between the prior confidence of the head and the confidence of  */
  /* the rule. Like the other measures in this file, degenerate      */
  /* input yields 0 instead of dividing by zero: without this guard  */
  /* the result would be NaN or infinity, and a NaN compares false   */
  /* against any minimal value.                                      */
  if ((body <= 0) || (n <= 0))  /* check for strict positivity */
    return 0;                   /* of both denominators */
  return fabs(head/(double)n -set/(double)body);
}  /* _diff() */
539
+
540
+ /*--------------------------------------------------------------------*/
541
+
542
static double _quot (int set, int body, int head, int n)
{                               /* --- diff. of conf. quotient to 1 */
  double ratio;                 /* rule conf. over prior conf. */

  /* Returns 1 - min(q, 1/q) with q = (set/body) / (head/n); */
  /* 0 if head or body are not strictly positive.            */
  if ((head <= 0) || (body <= 0))
    return 0;
  ratio = (set/(double)body) /(head/(double)n);
  if (ratio > 1)                /* fold quotients above 1 */
    ratio = 1/ratio;            /* back into [0,1] */
  return 1 -ratio;
}  /* _quot() */
550
+
551
+ /*--------------------------------------------------------------------*/
552
+
553
static double _aimp (int set, int body, int head, int n)
{                               /* --- abs. diff. of improvement to 1 */
  double lift;                  /* rule conf. over prior conf. */

  /* Returns |(set/body) / (head/n) - 1|; */
  /* 0 unless head and body are both strictly positive. */
  if (!((head > 0) && (body > 0)))
    return 0;
  lift = (set/(double)body) /(head/(double)n);
  return fabs(lift -1);
}  /* _aimp() */
558
+
559
+ /*--------------------------------------------------------------------*/
560
+
561
+ static double _info (int set, int body, int head, int n)
562
+ { /* --- information diff. to prior */
563
+ double sum, t; /* result, temporary buffer */
564
+
565
+ if ((head <= 0) || (head >= n)
566
+ || (body <= 0) || (body >= n))
567
+ return 0; /* check for strict positivity */
568
+ sum = 0; /* support of head and body */
569
+ if (set > 0) sum += set *log(set /( head *(double) body));
570
+ t = body -set; /* support of not head and body */
571
+ if (t > 0) sum += t *log(t /((n-head) *(double) body));
572
+ t = head -set; /* support of head and not body */
573
+ if (t > 0) sum += t *log(t /( head *(double)(n-body)));
574
+ t = n -head -body +set; /* support of not head and not body */
575
+ if (t > 0) sum += t *log(t /((n-head) *(double)(n-body)));
576
+ return (log(n) +sum/n) /LN_2; /* return information gain in bits */
577
+ } /* _info() */
578
+
579
+ /*--------------------------------------------------------------------*/
580
+
581
static double _chi2 (int set, int body, int head, int n)
{                               /* --- normalized chi^2 measure */
  double diff;                  /* head*body - set*n */
  double denom;                 /* product of the four marginals */

  /* Returns (head*body - set*n)^2 / (head*(n-head)*body*(n-body)); */
  /* 0 if head or body do not lie strictly between 0 and n.         */
  if ((head <= 0) || (head >= n)
  ||  (body <= 0) || (body >= n))
    return 0;                   /* check for strict positivity */
  diff  = head *(double)body -set *(double)n;
  denom = ((double)head) *(n-head) *body *(n-body);
  return (diff*diff) / denom;   /* compute and return chi^2 measure */
}  /* _chi2() */
591
+
592
+ /*--------------------------------------------------------------------*/
593
+
594
static double _pval (int set, int body, int head, int n)
{                               /* --- p-value from chi^2 measure */
  double stat;                  /* chi^2 value scaled by n */

  /* _chi2() yields the normalized measure; multiplying by n gives */
  /* the value handed to chi2cdf() together with the constant 1    */
  /* (presumably the degrees of freedom -- confirm with chi2cdf).  */
  stat = n *_chi2(set, body, head, n);
  return chi2cdf(stat, 1);
}  /* _pval() */
598
+
599
+ /*--------------------------------------------------------------------*/
600
+
601
+ static EVALFN *_evalfns[EM_UNKNOWN] = { /* one entry per measure id. below EM_UNKNOWN */
602
+ /* EM_NONE 0 */ _none, /* no additional evaluation measure (always 1) */
603
+ /* EM_DIFF 1 */ _diff, /* absolute confidence difference */
604
+ /* EM_QUOT 2 */ _quot, /* difference of conf. quotient to 1 */
605
+ /* EM_AIMP 3 */ _aimp, /* abs. diff. of improvement to 1 */
606
+ /* EM_INFO 4 */ _info, /* information difference to prior */
607
+ /* EM_CHI2 5 */ _chi2, /* normalized chi^2 measure */
608
+ /* EM_PVAL 6 */ _pval, /* p-value of chi^2 measure */
609
+ }; /* table of evaluation functions; the order must match the EM_ codes noted on the left */
610
+
611
+ /*----------------------------------------------------------------------
612
+ Main Functions
613
+ ----------------------------------------------------------------------*/
614
+
615
+ ISTREE* ist_create (ITEMSET *set, int mode, int supp, double conf)
616
+ { /* --- create an item set tree */
617
+ int cnt, n; /* number of items, buffer */
618
+ ISTREE *ist; /* created item set tree */
619
+ ISNODE *root; /* root node of the tree */
620
+
621
+ assert(set /* check the function arguments */
622
+ && (supp >= 0) && (conf >= 0) && (conf <= 1));
623
+
624
+ /* --- allocate memory --- */
625
+ cnt = is_cnt(set); /* get the number of items */
626
+ ist = (ISTREE*)malloc(sizeof(ISTREE));
627
+ if (!ist) return NULL; /* allocate the tree body */
628
+ ist->lvls = (ISNODE**)malloc(BLKSIZE *sizeof(ISNODE*));
629
+ if (!ist->lvls) { free(ist); return NULL; }
630
+ ist->buf = (int*) malloc(BLKSIZE *sizeof(int));
631
+ if (!ist->buf) { free(ist->lvls); free(ist); return NULL; }
632
+ ist->map = (int*) malloc(cnt *sizeof(int));
633
+ if (!ist->map) { free(ist->buf);
634
+ free(ist->lvls); free(ist); return NULL; }
635
+ #ifdef ARCH64 /* if 64 bit architecture, */
636
+ n = cnt +(cnt & 1); /* pad counters to even number */
637
+ #else /* on 32 bit systems, however, */
638
+ n = cnt; /* use the number of items directly */
639
+ #endif
640
+ ist->lvls[0] = ist->curr = /* allocate a root node */
641
+ root = (ISNODE*)calloc(1, sizeof(ISNODE) +(n-1) *sizeof(int));
642
+ if (!root) { free(ist->map); free(ist->buf);
643
+ free(ist->lvls); free(ist); return NULL; }
644
+
645
+ /* --- initialize structures --- */
646
+ ist->set = set; /* copy parameters to the structure */
647
+ ist->mode = mode;
648
+ ist->tacnt = is_gettac(set);
649
+ ist->vsz = BLKSIZE;
650
+ ist->height = 1;
651
+ ist->rule = (supp > 0) ? supp : 1;
652
+ if (mode & IST_HEAD) supp = (int)ceil(conf *supp);
653
+ ist->supp = (supp > 0) ? supp : 1;
654
+ ist->conf = conf;
655
+ #ifdef BENCH /* if benchmark version */
656
+ ist->sccnt = ist->scnec = cnt;
657
+ ist->cpcnt = ist->cpnec = 0;
658
+ ist->bytes = sizeof(ISTREE) +cnt *sizeof(char) +8
659
+ + BLKSIZE *sizeof(ISNODE*) +8
660
+ + BLKSIZE *sizeof(int) +8
661
+ + cnt *sizeof(int) +8;
662
+ #endif /* initialize the benchmark variables */
663
+ ist_init(ist, 1, EM_NONE, 1); /* initialize rule extraction */
664
+ root->parent = root->succ = NULL;
665
+ root->offset = root->id = 0;
666
+ root->chcnt = 0; /* initialize the root node */
667
+ root->size = n;
668
+ while (--cnt >= 0) /* copy the item frequencies */
669
+ root->cnts[cnt] = is_getfrq(set, cnt);
670
+ return ist; /* return created item set tree */
671
+ } /* ist_create() */
672
+
673
+ /*--------------------------------------------------------------------*/
674
+
675
+ void ist_delete (ISTREE *ist)
676
+ { /* --- delete an item set tree */
677
+ int i; /* loop variables */
678
+ ISNODE *node, *t; /* to traverse the nodes */
679
+
680
+ assert(ist); /* check the function argument */
681
+ for (i = ist->height; --i >= 0; ) {
682
+ for (node = ist->lvls[i]; node; ) {
683
+ t = node; node = node->succ; free(t); }
684
+ } /* delete all nodes, */
685
+ free(ist->lvls); /* the level vector, */
686
+ free(ist->map); /* the identifier map, */
687
+ free(ist->buf); /* the path buffer, */
688
+ free(ist); /* and the tree body */
689
+ } /* ist_delete() */
690
+
691
+ /*--------------------------------------------------------------------*/
692
+
693
+ void ist_count (ISTREE *ist, int *set, int cnt)
694
+ { /* --- count transaction in tree */
695
+ assert(ist /* check the function arguments */
696
+ && (cnt >= 0) && (set || (cnt <= 0)));
697
+ if (cnt >= ist->height) /* recursively count transaction */
698
+ _count(ist->lvls[0], set, cnt, ist->height);
699
+ } /* ist_count() */
700
+
701
+ /*--------------------------------------------------------------------*/
702
+
703
+ void ist_countx (ISTREE *ist, TATREE *tat)
704
+ { /* --- count transaction in tree */
705
+ assert(ist && tat); /* check the function arguments */
706
+ _countx(ist->lvls[0], tat, ist->height);
707
+ } /* ist_countx() */ /* recursively count the trans. tree */
708
+
709
+ /*--------------------------------------------------------------------*/
710
+
711
+ int ist_check (ISTREE *ist, char *marks)
712
+ { /* --- check item usage */
713
+ int i, n; /* loop variable, number of items */
714
+
715
+ assert(ist); /* check the function argument */
716
+ for (i = ist->lvls[0]->size; --i >= 0; )
717
+ marks[i] = 0; /* clear the marker vector */
718
+ _checkuse(ist->lvls[0], marks, ist->supp);
719
+ for (n = 0, i = ist->lvls[0]->size; --i >= 0; )
720
+ if (marks[i]) n++; /* count used items */
721
+ return n; /* and return this number */
722
+ } /* ist_check() */
723
+
724
+ /*--------------------------------------------------------------------*/
725
+
726
+ int ist_addlvl (ISTREE *ist)
727
+ { /* --- add a level to item set tree
+    * For every node of the deepest level the children are created
+    * with _child() and chained into the list of the new level; the
+    * node itself is then reallocated so that a child vector (and
+    * possibly a secondary identifier map) fits behind its counters.
+    * Returns  0 if a level was added (ist->height is incremented),
+    *          1 if no child was created (tree left at old height),
+    *         -1 if memory allocation failed; once children have
+    *            been created _cleanup() removes them again. */
728
+ int i, n, c; /* loop variable, counter, buffer */
729
+ ISNODE **ndp; /* to traverse the nodes */
730
+ ISNODE *node; /* new (reallocated) node */
731
+ ISNODE **end; /* end of new level node list */
732
+ ISNODE *cur; /* current node in new level */
733
+ ISNODE *frst; /* first child of current node */
734
+ ISNODE *last; /* last child of current node */
735
+ ISNODE **vec; /* child node vector */
736
+ int *map; /* identifier map */
737
+ void *p; /* temporary buffer */
738
+
739
+ assert(ist); /* check the function arguments */
740
+
741
+ /* --- enlarge level vector --- */
742
+ if (ist->height >= ist->vsz){ /* if the level vector is full */
743
+ n = ist->vsz +BLKSIZE; /* compute new vector size */
744
+ p = realloc(ist->lvls, n *sizeof(ISNODE*));
745
+ if (!p) return -1; /* enlarge the level vector */
746
+ ist->lvls = (ISNODE**)p; /* and set the new vector */
747
+ p = realloc(ist->buf, n *sizeof(int));
748
+ if (!p) return -1; /* enlarge the buffer vector (vsz stays unchanged on failure) */
749
+ ist->buf = (int*)p; /* and set the new vector */
750
+ ist->vsz = n; /* set the new vector size */
751
+ } /* (applies to buf and levels) */
752
+ end = ist->lvls +ist->height;
753
+ *end = NULL; /* start a new tree level */
754
+
755
+ /* --- add tree level --- */
756
+ for (ndp = ist->lvls +ist->height -1; *ndp; ndp = &(*ndp)->succ) {
757
+ frst = last = NULL; /* traverse the deepest nodes */
758
+ for (i = n = 0; i < (*ndp)->size; i++) {
759
+ cur = _child(ist, *ndp, i, ist->supp, ist->rule);
760
+ if (!cur) continue; /* create a child if necessary */
761
+ if (cur == (void*)-1) { _cleanup(ist); return -1; }
762
+ if (!frst) frst = cur; /* note first and last child node */
763
+ *end = last = cur; /* add node at the end of the list */
764
+ end = &cur->succ; n++; /* that contains the new level */
765
+ } /* and advance end pointer */
766
+ if (n <= 0) { /* if no child node was created, */
767
+ (*ndp)->chcnt = F_SKIP; continue; } /* skip the node */
768
+ #ifdef BENCH /* if benchmark version */
769
+ ist->cpnec += n; /* sum the number of necessary */
770
+ #endif /* child pointers */
771
+ node = *ndp; /* decide on the node structure: */
772
+ if (node->offset >= 0) { /* if a pure counter vector is used, */
773
+ n = ID(last)-ID(frst)+1; /* always add a pure child vector */
774
+ i = (node->size -1) *sizeof(int) +n *sizeof(ISNODE*); }
775
+ else if (2*n > node->size){ /* if a single id. map is best, */
776
+ n = node->size; /* only add a child vector */
777
+ i = (n+n-1) *sizeof(int) +n *sizeof(ISNODE*); }
778
+ else { /* if two identifier maps are best, */
779
+ i = node->size; /* add a child vector and a map */
780
+ i = (i+i-1) *sizeof(int) +n *(sizeof(ISNODE*) +sizeof(int));
781
+ } /* get size of additional vectors */
782
+ node = (ISNODE*)realloc(node, sizeof(ISNODE) +i);
783
+ if (!node) { _cleanup(ist); return -1; }
784
+ node->chcnt = n; /* add a child vector to the node */
785
+ #ifdef BENCH /* if benchmark version */
786
+ ist->cpcnt += n; /* sum the number of child pointers */
787
+ if ((node->offset >= 0) || (node->size == n))
788
+ ist->bytes += n * sizeof(ISNODE*);
789
+ else ist->bytes += n *(sizeof(ISNODE*) +sizeof(int));
790
+ #endif /* determine the memory usage */
791
+ if ((node != *ndp) && node->parent) { /* if realloc moved a non-root node */
792
+ last = node->parent; /* adapt the ref. from the parent */
793
+ if (last->offset >= 0) { /* if a pure vector is used */
794
+ vec = (ISNODE**)(last->cnts +last->size);
795
+ vec[(vec[0] != *ndp) ? ID(node) -ID(vec[0]) : 0] = node; } /* *ndp is the old address: only compared, vec[0] is not dereferenced if it is the moved node */
796
+ else { /* if an identifier map is used */
797
+ map = last->cnts +(i = last->size);
798
+ vec = (ISNODE**)(map+i);/* get identifier map, child vector, */
799
+ c = last->chcnt & ~F_SKIP; /* and the number of children */
800
+ if (c < i) /* if a secondary id. map exists, */
801
+ map = (int*)(vec +(i = c)); /* get this identifier map */
802
+ vec[_bsearch(map, i, ID(node))] = node;
803
+ } /* find the proper index and */
804
+ } /* set the new child pointer */
805
+ *ndp = node; /* set new (reallocated) node */
806
+ if (node->offset >= 0) { /* if to use pure vectors */
807
+ vec = (ISNODE**)(node->cnts +node->size);
808
+ while (--n >= 0) vec[n] = NULL;
809
+ i = ID(frst); /* get item identifier of first child */
810
+ for (cur = frst; cur; cur = cur->succ) {
811
+ vec[ID(cur)-i] = cur; /* set the child node pointer */
812
+ cur->parent = node; /* and the parent pointer */
813
+ } } /* in the new node */
814
+ else if (n < node->size) { /* if two identifier maps are used */
815
+ vec = (ISNODE**)(node->cnts +node->size +node->size);
816
+ map = (int*)(vec +n); /* get the secondary identifier map */
817
+ for (i = 0, cur = frst; cur; cur = cur->succ) {
818
+ vec[i] = cur; /* set the child node pointer, */
819
+ map[i++] = ID(cur); /* the identifier map entry, */
820
+ cur->parent = node; /* and the parent pointer */
821
+ } } /* in the new node */
822
+ else { /* if one identifier map is used */
823
+ map = node->cnts +(i = node->size);
824
+ vec = (ISNODE**)(map +i); /* get id. map and child vector */
825
+ while (--n >= 0) vec[n] = NULL;
826
+ for (cur = frst; cur; cur = cur->succ) {
827
+ vec[_bsearch(map, i, ID(cur))] = cur;
828
+ cur->parent = node; /* set the child node pointer */
829
+ } /* and the parent pointer */
830
+ } /* in the new node */
831
+ }
832
+ if (!ist->lvls[ist->height]) /* if no child has been added, */
833
+ return 1; /* abort the function, otherwise */
834
+ ist->height++; /* increment the level counter */
835
+ _checksub(ist->lvls[0]); /* check for unnecessary subtrees */
836
+ return 0; /* return 'ok' */
837
+ } /* ist_addlvl() */
838
+
839
+ /*--------------------------------------------------------------------*/
840
+
841
+ void ist_up (ISTREE *ist, int root)
842
+ { /* --- go up in item set tree */
843
+ assert(ist && ist->curr); /* check the function argument */
844
+ if (root) /* if root flag set, */
845
+ ist->curr = ist->lvls[0]; /* go to the root node */
846
+ else if (ist->curr->parent) /* if it exists, go to the parent */
847
+ ist->curr = ist->curr->parent;
848
+ } /* ist_up() */
849
+
850
+ /*--------------------------------------------------------------------*/
851
+
852
+ int ist_down (ISTREE *ist, int item)
853
+ { /* --- go down in item set tree */
854
+ ISNODE *node; /* the current node */
855
+ ISNODE **vec; /* child node vector of current node */
856
+ int *map, n; /* identifier map and its size */
857
+ int c; /* number of children */
858
+
859
+ assert(ist && ist->curr); /* check the function argument */
860
+ node = ist->curr; /* get the current node */
861
+ c = node->chcnt & ~F_SKIP; /* if there are no child nodes, */
862
+ if (c <= 0) return -1; /* abort the function */
863
+ if (node->offset >= 0) { /* if a pure vector is used */
864
+ vec = (ISNODE**)(node->cnts +node->size);
865
+ item -= ID(vec[0]); /* compute index in child node vector */
866
+ if (item >= c) return -1; } /* and abort if there is no child */
867
+ else { /* if an identifier map is used */
868
+ map = node->cnts +(n = node->size);
869
+ vec = (ISNODE**)(map +n); /* get id. map and child vector */
870
+ if (c < n) /* if a secondary id. map exists, */
871
+ map = (int*)(vec +(n = c)); /* get this identifier map */
872
+ item = _bsearch(map, n, item);
873
+ } /* search for the proper index */
874
+ if ((item < 0) || !vec[item]) /* if the index is out of range */
875
+ return -1; /* or the child does not exist, abort */
876
+ ist->curr = vec[item]; /* otherwise go to the child node */
877
+ return 0; /* return 'ok' */
878
+ } /* ist_down() */
879
+
880
+ /*--------------------------------------------------------------------*/
881
+
882
+ int ist_next (ISTREE *ist, int item)
883
+ { /* --- get next item with a counter */
884
+ int i; /* vector index */
885
+ ISNODE *node; /* the current node */
886
+ int *map, n; /* identifier map and its size */
887
+
888
+ assert(ist && ist->curr); /* check the function argument */
889
+ node = ist->curr; /* get the current node */
890
+ if (node->offset >= 0) { /* if a pure vector is used, */
891
+ if (item < node->offset) return node->offset;
892
+ if (item >= node->offset +node->size) return -1;
893
+ return item +1; } /* return the next item identifier */
894
+ else { /* if an identifier map is used */
895
+ map = node->cnts +(n = node->size);
896
+ if (item < map[0]) return map[0];
897
+ if (item >= map[n-1]) return -1;
898
+ i = _bsearch(map, n, item); /* try to find the item directly */
899
+ if (i >= 0) return map[i+1];/* and return the following one */
900
+ while ((--n >= 0) && (*map > item)) map++;
901
+ return (n >= 0) ? *map :-1; /* search iteratively for the next */
902
+ } /* item identifier and return it */
903
+ } /* ist_next() */
904
+
905
+ /*--------------------------------------------------------------------*/
906
+
907
+ void ist_setcnt (ISTREE *ist, int item, int cnt)
908
+ { /* --- set counter for an item */
909
+ ISNODE *node; /* the current node */
910
+ ISNODE **vec; /* child node vector of current node */
911
+ int *map, n; /* identifier map and its size */
912
+ int c; /* number of children */
913
+
914
+ assert(ist && ist->curr); /* check the function argument */
915
+ node = ist->curr; /* get the current node */
916
+ if (node->offset >= 0) { /* if a pure vector is used, */
917
+ item -= node->offset; /* get index in counter vector */
918
+ if (item >= node->size) return; }
919
+ else { /* if an identifier map is used */
920
+ map = node->cnts +(n = node->size);
921
+ vec = (ISNODE**)(map +n); /* get id. map and child vector */
922
+ c = node->chcnt & ~F_SKIP; /* and the number of children */
923
+ if (c < n) /* if a secondary id. map exists, */
924
+ map = (int*)(vec +(n = c)); /* get this identifier map */
925
+ item = _bsearch(map, n, item);
926
+ } /* search for the proper index */
927
+ if (item >= 0) node->cnts[item] = cnt;
928
+ } /* ist_setcnt() */ /* set the frequency counter */
929
+
930
+ /*--------------------------------------------------------------------*/
931
+
932
+ int ist_getcnt (ISTREE *ist, int item)
933
+ { /* --- get counter for an item */
934
+ ISNODE *node; /* the current node */
935
+ ISNODE **vec; /* child node vector of current node */
936
+ int *map, n; /* identifier map and its size */
937
+ int c; /* number of children */
938
+
939
+ assert(ist && ist->curr); /* check the function argument */
940
+ node = ist->curr; /* get the current node */
941
+ if (node->offset >= 0) { /* if pure vectors are used, */
942
+ item -= node->offset; /* get index in counter vector */
943
+ if (item >= node->size) return -1; }
944
+ else { /* if an identifier map is used */
945
+ map = node->cnts +(n = node->size);
946
+ vec = (ISNODE**)(map +n); /* get id. map and child vector */
947
+ c = node->chcnt & ~F_SKIP; /* and the number of children */
948
+ if (c < n) /* if a secondary id. map exists, */
949
+ map = (int*)(vec +(n = c)); /* get this identifier map */
950
+ item = _bsearch(map, n, item);
951
+ } /* search for the proper index */
952
+ if (item < 0) return -1; /* abort if index is out of range */
953
+ return node->cnts[item]; /* return the value of the counter */
954
+ } /* ist_getcnt() */
955
+
956
+ /*--------------------------------------------------------------------*/
957
+
958
+ int ist_getcntx (ISTREE *ist, int *set, int cnt)
959
+ { /* --- get counter for an item set */
960
+ assert(ist /* check the function arguments */
961
+ && (cnt >= 0) && (set || (cnt <= 0)));
962
+ if (cnt <= 0) /* if the item set is empty, */
963
+ return ist->tacnt; /* return the transaction count */
964
+ return COUNT(_getsupp(ist->lvls[0], set, cnt));
965
+ } /* ist_getcntx() */ /* return the item set support */
966
+
967
+ /*--------------------------------------------------------------------*/
968
+
969
+ void ist_filter (ISTREE *ist, int mode)
970
+ { /* --- filter frequent item sets */
971
+ int i, k; /* loop variables */
972
+ ISNODE *node; /* to traverse the nodes */
973
+ int supp; /* support of an item set */
974
+
975
+ assert(ist); /* check the function argument */
976
+ if (mode == IST_CLEAR) { /* if to clear all skip flags */
977
+ for (k = 1; k < ist->height; k++)
978
+ for (node = ist->lvls[k]; node; node = node->succ)
979
+ for (i = 0; i < node->size; i++)
980
+ node->cnts[i] &= ~F_SKIP;
981
+ return; /* clear all skip flags */
982
+ } /* and abort the function */
983
+ supp = -1; /* set default support filter */
984
+ for (k = 1; k < ist->height; k++) {
985
+ for (node = ist->lvls[k]; node; node = node->succ) {
986
+ for (i = 0; i < node->size; i++) {
987
+ if (node->cnts[i] < ist->supp)
988
+ continue; /* skip infrequent item sets */
989
+ if (mode == IST_CLOSED) supp = node->cnts[i];
990
+ _marksub(ist, node, i, supp);
991
+ } /* mark all n-1 subsets */
992
+ } /* of the current item set */
993
+ } /* that have to be cleared/marked */
994
+ } /* ist_filter() */
995
+
996
+ /*--------------------------------------------------------------------*/
997
+
998
+ void ist_init (ISTREE *ist, int minlen, int arem, double minval)
999
+ { /* --- initialize (rule) extraction */
1000
+ assert(ist /* check the function arguments */
1001
+ && (minlen > 0) && (minval >= 0.0) && (minval <= 1.0));
1002
+ ist->item = ist->index = -1; /* initialize rule extraction */
1003
+ ist->node = ist->lvls[minlen -1];
1004
+ ist->size = minlen;
1005
+ ist->head = NULL;
1006
+ if ((arem < EM_NONE) || (arem >= EM_UNKNOWN))
1007
+ arem = EM_NONE; /* check, adapt, and note */
1008
+ ist->arem = arem; /* additional evaluation measure */
1009
+ ist->minval = minval; /* and its minimal value */
1010
+ } /* ist_init() */
1011
+
1012
+ /*--------------------------------------------------------------------*/
1013
+
1014
+ int ist_set (ISTREE *ist, int *set, int *supp, double *aval)
1015
+ { /* --- extract next frequent item set */
1016
+ int i; /* loop variable */
1017
+ int item; /* an item identifier */
1018
+ ISNODE *node, *tmp; /* current item set node, buffer */
1019
+ int *cnts; /* to access the item frequencies */
1020
+ int s_set; /* support of the current set */
1021
+ double dev; /* deviation from indep. occurrence */
1022
+
1023
+ assert(ist && set && supp); /* check the function arguments */
1024
+ if (ist->size > ist->height) /* if the tree is not high enough */
1025
+ return -1; /* for the item set size, abort */
1026
+
1027
+ /* --- find frequent item set --- */
1028
+ node = ist->node; /* get the current item set node */
1029
+ while (1) { /* search for a frequent item set */
1030
+ if (++ist->index >= node->size) { /* if all subsets have been */
1031
+ node = node->succ; /* processed, go to the successor */
1032
+ if (!node) { /* if at the end of a level, go down */
1033
+ if (++ist->size > ist->height)
1034
+ return -1; /* if beyond the deepest level, abort */
1035
+ node = ist->lvls[ist->size -1];
1036
+ } /* get the 1st node of the new level */
1037
+ ist->node = node; /* note the new item set node */
1038
+ ist->index = 0; /* start with the first item set */
1039
+ } /* of the new item set node */
1040
+ if (node->offset >= 0) item = node->offset +ist->index;
1041
+ else item = node->cnts[node->size +ist->index];
1042
+ if (is_getapp(ist->set, item) == IST_IGNORE)
1043
+ continue; /* skip items to ignore */
1044
+ s_set = node->cnts[ist->index];
1045
+ if (s_set < ist->supp) /* if the support is not sufficient, */
1046
+ continue; /* go to the next item set */
1047
+ /* Note that this check automatically skips all item sets that */
1048
+ /* are marked with the flag F_SKIP, because s_set is negative */
1049
+ /* with this flag and thus necessarily smaller than ist->supp. */
1050
+ dev = 0; /* init. add. evaluation measure */
1051
+ if (ist->arem == EM_DIFF) { /* if logarithm of support quotient */
1052
+ cnts = ist->lvls[0]->cnts;
1053
+ dev = log(s_set) -log(COUNT(cnts[item]));
1054
+ for (tmp = node; tmp->parent; tmp = tmp->parent)
1055
+ dev -= log(COUNT(cnts[ID(tmp)]));
1056
+ dev = (dev +(ist->size-1) *log(ist->tacnt)) *(0.01/LN_2);
1057
+ if (dev < ist->minval) /* if the value of the additional */
1058
+ continue; /* eval. measure is not high enough, */
1059
+ } /* skip the item set */
1060
+ break; /* otherwise abort the search loop */
1061
+ }
1062
+ *supp = s_set; /* store the item set support and */
1063
+ if (aval) *aval = dev; /* the value of the add. measure */
1064
+
1065
+ /* --- build frequent item set --- */
1066
+ i = ist->size; /* get the current item set size */
1067
+ set[--i] = item; /* and store the first item */
1068
+ while (node->parent) { /* while not at the root node */
1069
+ set[--i] = ID(node); /* add item to the item set */
1070
+ node = node->parent; /* and go to the parent node */
1071
+ }
1072
+ return ist->size; /* return the item set size */
1073
+ } /* ist_set() */
1074
+
1075
+ /*--------------------------------------------------------------------*/
1076
+
1077
+ int ist_rule (ISTREE *ist, int *rule,
1078
+ int *supp, double *conf, double *lift, double *aval)
1079
+ { /* --- extract next rule */
1080
+ int i; /* loop variable */
1081
+ int item; /* an item identifier */
1082
+ ISNODE *node; /* current item set node */
1083
+ ISNODE *parent; /* parent of the item set node */
1084
+ int *map, n; /* identifier map and its size */
1085
+ int s_set; /* support of set (body & head) */
1086
+ int s_body; /* support of body (antecedent) */
1087
+ int s_head; /* support of head (consequent) */
1088
+ double c, v; /* confidence and measure value */
1089
+ int app; /* appearance flag of head item */
1090
+
1091
+ assert(ist && rule && supp); /* check the function arguments */
1092
+ if (ist->size > ist->height) /* if the tree is not high enough */
1093
+ return -1; /* for the rule length, abort */
1094
+
1095
+ /* --- find rule --- */
1096
+ node = ist->node; /* get the current item set node */
1097
+ while (1) { /* search for a rule */
1098
+ if (ist->item >= 0) { /* --- select next item subset */
1099
+ *--ist->path = ist->item; /* add previous head to the path */
1100
+ ist->plen++; /* and get the next head item */
1101
+ ist->item = ID(ist->head);
1102
+ ist->head = ist->head->parent;
1103
+ if (!ist->head) /* if all subsets have been processed */
1104
+ ist->item = -1; /* clear the head item to trigger the */
1105
+ } /* selection of a new item set */
1106
+ if (ist->item < 0) { /* --- select next item set */
1107
+ if (++ist->index >= node->size){/* if all subsets have been */
1108
+ node = node->succ; /* processed, go to the successor */
1109
+ if (!node) { /* if at the end of a level, go down */
1110
+ if (++ist->size > ist->height)
1111
+ return -1; /* if beyond the deepest level, abort */
1112
+ node = ist->lvls[ist->size -1];
1113
+ } /* get the 1st node of the new level */
1114
+ ist->node = node; /* note the new item set node and */
1115
+ ist->index = 0; /* start with the first item set */
1116
+ } /* of the new item set node */
1117
+ if (node->offset >= 0) item = node->offset +ist->index;
1118
+ else item = node->cnts[node->size +ist->index];
1119
+ app = is_getapp(ist->set, item);
1120
+ if ((app == IST_IGNORE) || (HDONLY(node) && (app == IST_HEAD)))
1121
+ continue; /* skip sets with two head only items */
1122
+ ist->item = item; /* set the head item identifier */
1123
+ ist->hdonly = HDONLY(node) || (app == IST_HEAD);
1124
+ ist->head = node; /* set the new head item node */
1125
+ ist->path = ist->buf +ist->vsz;
1126
+ ist->plen = 0; /* clear the path */
1127
+ }
1128
+ app = is_getapp(ist->set, ist->item); /* get head item appearance */
1129
+ if (!(app & IST_HEAD) || (ist->hdonly && (app != IST_HEAD)))
1130
+ continue; /* if rule is not allowed, skip it */
1131
+ s_set = COUNT(node->cnts[ist->index]);
1132
+ if (s_set < ist->supp) { /* get and check the item set support */
1133
+ ist->item = -1; continue; }
1134
+ parent = node->parent; /* get the parent node */
1135
+ if (ist->plen > 0) /* if there is a path, use it */
1136
+ s_body = COUNT(_getsupp(ist->head, ist->path, ist->plen));
1137
+ else if (!parent) /* if there is no parent (root node), */
1138
+ s_body = ist->tacnt; /* get the number of transactions */
1139
+ else if (parent->offset >= 0) /* if a pure vector is used */
1140
+ s_body = COUNT(parent->cnts[ID(node) -parent->offset]);
1141
+ else { /* if an identifier map is used */
1142
+ map = parent->cnts +(n = parent->size);
1143
+ s_body = COUNT(parent->cnts[_bsearch(map, n, ID(node))]);
1144
+ } /* find vector index and get support */
1145
+ if (s_body < ist->rule) /* if the body support is too low, */
1146
+ continue; /* get the next subset/next set */
1147
+ c = s_set/(double)s_body; /* compute the rule confidence */
1148
+ if (c < ist->conf -EPSILON) /* if the confidence is too low, */
1149
+ continue; /* go to the next item (sub)set */
1150
+ s_head = COUNT(ist->lvls[0]->cnts[ist->item]);
1151
+ if (ist->arem == EM_NONE) { /* if no add. eval. measure given, */
1152
+ v = 0; break; } /* abort the loop (select the rule) */
1153
+ if (ist->size < 2) { /* if rule has an empty antecedent, */
1154
+ v = 0; break; } /* abort the loop (select the rule) */
1155
+ v = _evalfns[ist->arem](s_set, s_body, s_head, ist->tacnt);
1156
+ if (v >= ist->minval) /* if rule value exceeds the minimal */
1157
+ break; /* of the add. rule eval. measure, */
1158
+ } /* while (1) */ /* abort the loop (select rule) */
1159
+ *supp = (ist->mode & IST_HEAD) ? s_set : s_body;
1160
+ if (lift) /* compute and store the lift value */
1161
+ *lift = (c *ist->tacnt)/(double)s_head;
1162
+ if (conf) *conf = c; /* store the rule confidence and */
1163
+ if (aval) *aval = v; /* the value of the add. measure */
1164
+
1165
+ /* --- build rule --- */
1166
+ if (node->offset >= 0) item = node->offset +ist->index;
1167
+ else item = node->cnts[node->size +ist->index];
1168
+ i = ist->size; /* get the current item and */
1169
+ if (item != ist->item) /* if this item is not the head, */
1170
+ rule[--i] = item; /* add it to the rule body */
1171
+ while (node->parent) { /* traverse the path to the root */
1172
+ if (ID(node) != ist->item) /* and add all items on this */
1173
+ rule[--i] = ID(node); /* path to the rule body */
1174
+ node = node->parent; /* (except the head of the rule) */
1175
+ }
1176
+ rule[0] = ist->item; /* set the head of the rule, */
1177
+ return ist->size; /* return the rule size */
1178
+ } /* ist_rule() */
1179
+
1180
+ /*--------------------------------------------------------------------*/
1181
+
1182
+ int ist_hedge (ISTREE *ist, int *hedge,
1183
+ int *supp, double *conf, double *aval)
1184
+ { /* --- extract next hyperedge */
1185
+ int i; /* loop variable */
1186
+ int item; /* an item identifier */
1187
+ ISNODE *node; /* current item set node */
1188
+ ISNODE *head; /* node containing the rule head */
1189
+ int *map, n; /* identifier map and its size */
1190
+ int *path, plen; /* path in tree and its length */
1191
+ int s_set; /* support of set (body & head) */
1192
+ int s_body; /* support of body (antecedent) */
1193
+ int s_head; /* support of head (consequent) */
1194
+ double c, t, v = 0; /* confidence and measure value */
1195
+
1196
+ assert(ist && hedge && supp); /* check the function arguments */
1197
+ if (ist->size > ist->height) /* if the tree is not high enough */
1198
+ return -1; /* for the hyperedge size, abort */
1199
+
1200
+ /* --- find hyperedge --- */
1201
+ node = ist->node; /* get the current item set node */
1202
+ while (1) { /* search for a hyperedge */
1203
+ if (++ist->index >= node->size) { /* if all subsets have been */
1204
+ node = node->succ; /* processed, go to the successor */
1205
+ if (!node) { /* if at the end of a level, go down */
1206
+ if (++ist->size > ist->height)
1207
+ return -1; /* if beyond the deepest level, abort */
1208
+ node = ist->lvls[ist->size -1];
1209
+ } /* get the 1st node of the new level */
1210
+ ist->node = node; /* note the new item set node and */
1211
+ ist->index = 0; /* start with the first item set */
1212
+ } /* of the new item set node */
1213
+ if (node->offset >= 0) item = node->offset +ist->index;
1214
+ else item = node->cnts[node->size +ist->index];
1215
+ if (is_getapp(ist->set, item) == IST_IGNORE)
1216
+ continue; /* skip items to ignore */
1217
+ s_set = COUNT(node->cnts[ist->index]);
1218
+ if (s_set < ist->supp) /* if the set support is too low, */
1219
+ continue; /* skip this item set */
1220
+ head = node->parent; /* get subset support from parent */
1221
+ if (!head) /* if there is no parent (root node), */
1222
+ s_body = ist->tacnt; /* get the total number of sets */
1223
+ else if (head->offset >= 0) /* if pure vectors are used */
1224
+ s_body = head->cnts[ID(node) -head->offset];
1225
+ else { /* if an identifier map is used */
1226
+ map = head->cnts +(n = head->size);
1227
+ s_body = head->cnts[_bsearch(map, n, ID(node))];
1228
+ } /* find index and get the support */
1229
+ if (s_body & F_SKIP) { /* check for a valid body */
1230
+ node->cnts[ist->index] |= F_SKIP; continue; }
1231
+ s_body = COUNT(s_body); /* get the support of body and head */
1232
+ s_head = COUNT(ist->lvls[0]->cnts[item]);
1233
+ c = s_set/(double)s_body; /* compute confidence and add. eval. */
1234
+ v = _evalfns[ist->arem](s_set, s_body, s_head, ist->tacnt);
1235
+ item = ID(node); /* note the next head item */
1236
+ plen = 1; /* and initialize the path */
1237
+ path = ist->buf +ist->vsz; /* (store first item) */
1238
+ *--path = ist->index +node->offset;
1239
+ while (head) { /* traverse the path up to root */
1240
+ s_body = _getsupp(head, path, plen);
1241
+ if (s_body & F_SKIP) break;
1242
+ s_body = COUNT(s_body); /* get the support of the body */
1243
+ *--path = item; plen++; /* store the previous head item */
1244
+ item = ID(head); /* in the path (extend path) */
1245
+ c += s_set/(double)s_body;/* sum the rule confidences */
1246
+ s_head = COUNT(ist->lvls[0]->cnts[item]);
1247
+ t = _evalfns[ist->arem](s_set, s_body, s_head, ist->tacnt);
1248
+ if (t < v) v = t; /* compute the add. evaluation */
1249
+ head = head->parent; /* and go to the parent node */
1250
+ } /* (get the next rule head) */
1251
+ c /= ist->size; /* average the rule confidences */
1252
+ if (!head /* check for a complete traversal */
1253
+ && (c > ist->conf -EPSILON)
1254
+ && (v > fabs(ist->minval) -EPSILON))
1255
+ break; /* check whether hyperedge qualifies */
1256
+ if ((ist->minval < 0) && node->parent)
1257
+ node->cnts[ist->index] |= F_SKIP;
1258
+ } /* while (1) */ /* otherwise mark it as skipped */
1259
+ *supp = s_set; /* store the hyperedge support, */
1260
+ if (conf) *conf = c; /* the average confidence and */
1261
+ if (aval) *aval = v; /* the value of the add. measure */
1262
+
1263
+ /* --- build hyperedge --- */
1264
+ i = ist->size -1; /* store the first item */
1265
+ if (node->offset >= 0) hedge[i] = ist->index +node->offset;
1266
+ else hedge[i] = node->cnts[node->size +ist->index];
1267
+ while (node->parent) { /* while not at the root node */
1268
+ hedge[--i] = ID(node); /* add item to the hyperedge */
1269
+ node = node->parent; /* and go to the parent node */
1270
+ }
1271
+ return ist->size; /* return the hyperedge size */
1272
+ } /* ist_hedge() */
1273
+
1274
+ /*--------------------------------------------------------------------*/
1275
+
1276
+ int ist_group (ISTREE *ist, int *group, int *supp, double *aval)
1277
+ { /* --- extract next group */
1278
+ int i; /* loop variable */
1279
+ int item; /* an item identifier */
1280
+ ISNODE *node; /* current item set node */
1281
+ ISNODE *head; /* node containing the rule head */
1282
+ int *map, n; /* identifier map and its size */
1283
+ int *path, plen; /* path in tree and its length */
1284
+ int s_set; /* support of set (body & head) */
1285
+ int s_body; /* support of body (antecedent) */
1286
+ int s_head; /* support of head (consequent) */
1287
+ double t, v = 0; /* additional measure value */
1288
+
1289
+ assert(ist && group && supp); /* check the function arguments */
1290
+ if (ist->item < 0) { /* if this is the first call */
1291
+ ist->size = ist->height; /* init. the extraction variables */
1292
+ ist->node = ist->lvls[ist->size -1]; ist->item = 0;
1293
+ }
1294
+ if (ist->size <= 1) /* if all groups are reported */
1295
+ return -1; /* for the hyperedge size, abort */
1296
+
1297
+ /* --- find next group --- */
1298
+ node = ist->node; /* get the current item set node */
1299
+ while (1) { /* search for a hyperedge */
1300
+ if (++ist->index >= node->size) { /* if all subsets have been */
1301
+ node = node->succ; /* processed, go to the successor */
1302
+ if (!node) { /* if at the end of a level, go down */
1303
+ if (--ist->size <= 1) /* if all groups are reported, */
1304
+ return -1; /* abort the extraction */
1305
+ node = ist->lvls[ist->size -1];
1306
+ } /* get the 1st node of the new level */
1307
+ ist->node = node; /* note the new item set node and */
1308
+ ist->index = 0; /* start with the first item set */
1309
+ } /* of the new item set node */
1310
+ if (node->offset >= 0) item = node->offset +ist->index;
1311
+ else item = node->cnts[node->size +ist->index];
1312
+ if (is_getapp(ist->set, item) == IST_IGNORE)
1313
+ continue; /* skip items to ignore */
1314
+ s_set = node->cnts[ist->index];
1315
+ if (s_set < ist->supp) { /* if the set support is too low */
1316
+ if (s_set & F_SKIP) _marksub(ist, node, ist->index, -1);
1317
+ continue; /* mark subsets if necessary */
1318
+ } /* and skip this item set */
1319
+ /* Note that this check automatically skips all item sets that */
1320
+ /* are marked with the flag F_SKIP, because s_set is negative */
1321
+ /* with this flag and thus necessarily smaller than ist->supp. */
1322
+ head = node->parent; /* get subset support from parent */
1323
+ if (!head) /* if there is no parent (root node), */
1324
+ s_body = ist->tacnt; /* get the total number of sets */
1325
+ else if (head->offset >= 0) /* if pure vectors are used */
1326
+ s_body = head->cnts[ID(node) -head->offset];
1327
+ else { /* if an identifier map is used */
1328
+ map = head->cnts +(n = head->size);
1329
+ s_body = head->cnts[_bsearch(map, n, ID(node))];
1330
+ } /* find index and get the support */
1331
+ s_body = COUNT(s_body); /* get the support of body and head */
1332
+ s_head = COUNT(ist->lvls[0]->cnts[item]);
1333
+ v = _evalfns[ist->arem](s_set, s_body, s_head, ist->tacnt);
1334
+ item = ID(node); /* note the next head item */
1335
+ plen = 1; /* and initialize the path */
1336
+ path = ist->buf +ist->vsz; /* (store first item) */
1337
+ *--path = ist->index +node->offset;
1338
+ while (head) { /* traverse the path up to root */
1339
+ s_body = COUNT(_getsupp(head, path, plen));
1340
+ *--path = item; plen++; /* store the previous head item */
1341
+ item = ID(head); /* in the path (extend path) */
1342
+ s_head = COUNT(ist->lvls[0]->cnts[item]);
1343
+ t = _evalfns[ist->arem](s_set, s_body, s_head, ist->tacnt);
1344
+ if (t < v) v = t; /* compute the add. evaluation */
1345
+ head = head->parent; /* and go to the parent node */
1346
+ } /* (get the next rule head) */
1347
+ if (!head /* check for a complete traversal */
1348
+ && (v > fabs(ist->minval) -EPSILON))
1349
+ break; /* check whether group qualifies */
1350
+ } /* while (1) */
1351
+ *supp = s_set; /* store the group support and */
1352
+ if (aval) *aval = v; /* the value of the add. measure */
1353
+ _marksub(ist, node, ist->index, -1);
1354
+
1355
+ /* --- build hyperedge --- */
1356
+ i = ist->size -1; /* store the first item */
1357
+ if (node->offset >= 0) group[i] = ist->index +node->offset;
1358
+ else group[i] = node->cnts[node->size +ist->index];
1359
+ while (node->parent) { /* while not at the root node */
1360
+ group[--i] = ID(node); /* add item to the hyperedge */
1361
+ node = node->parent; /* and go to the parent node */
1362
+ }
1363
+ return ist->size; /* return the hyperedge size */
1364
+ } /* ist_group() */
1365
+
1366
+ /*--------------------------------------------------------------------*/
1367
+ #ifndef NDEBUG
1368
+
1369
+ static void _showtree (ISNODE *node, int level)
1370
+ { /* --- show subtree */
1371
+ int i, k; /* loop variables, buffer */
1372
+ int *map, n; /* identifier map and its size */
1373
+ int c; /* number of children */
1374
+ ISNODE **vec; /* vector of child nodes */
1375
+
1376
+ assert(node && (level >= 0)); /* check the function arguments */
1377
+ c = node->chcnt & ~F_SKIP; /* get the number of children */
1378
+ if (c <= 0) /* if there are no children, */
1379
+ vec = NULL; /* clear the child vector variable */
1380
+ else if (node->offset >= 0) /* if a pure vector is used */
1381
+ vec = (ISNODE**)(node->cnts +node->size);
1382
+ else { /* if an identifier map is used */
1383
+ map = node->cnts +(n = node->size);
1384
+ vec = (ISNODE**)(map +n); /* get id. map and child vector */
1385
+ if (c < n) /* if a secondary id. map exists, */
1386
+ map = (int*)(vec +(n = c)); /* get this identifier map */
1387
+ } /* get child access variables */
1388
+ for (i = 0; i < node->size; i++) {
1389
+ for (k = level; --k >= 0; ) /* indent and print */
1390
+ printf(" "); /* item identifier and counter */
1391
+ if (node->offset >= 0) k = node->offset +i;
1392
+ else k = node->cnts[node->size +i];
1393
+ printf("%d: %d\n", k, COUNT(node->cnts[i]));
1394
+ if (!vec) continue; /* check whether there are children */
1395
+ if (node->offset >= 0) k -= ID(vec[0]);
1396
+ else k = _bsearch(map, n, k);
1397
+ if ((k >= 0) && (k < c) && vec[k])
1398
+ _showtree(vec[k], level +1);
1399
+ } /* show subtree recursively */
1400
+ } /* _showtree() */
1401
+
1402
+ /*--------------------------------------------------------------------*/
1403
+
1404
+ void ist_show (ISTREE *ist)
1405
+ { /* --- show an item set tree */
1406
+ assert(ist); /* check the function argument */
1407
+ _showtree(ist->lvls[0], 0); /* show nodes recursively */
1408
+ printf("total: %d\n", ist->tacnt);
1409
+ } /* ist_show() */ /* print number of transactions */
1410
+
1411
+ #endif