RubyGems - see5-installer - Versions diffs - 0.1.0 - Mend

see5-installer 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (67)

checksums.yaml +7 -0
data/.gitignore +8 -0
data/.rubocop.yml +11 -0
data/CHANGELOG.md +5 -0
data/Gemfile +10 -0
data/README.md +29 -0
data/Rakefile +12 -0
data/ext/c5.0/Makefile +86 -0
data/ext/c5.0/attwinnow.c +394 -0
data/ext/c5.0/c50.c +330 -0
data/ext/c5.0/classify.c +700 -0
data/ext/c5.0/confmat.c +195 -0
data/ext/c5.0/construct.c +853 -0
data/ext/c5.0/contin.c +613 -0
data/ext/c5.0/defns.i +788 -0
data/ext/c5.0/discr.c +307 -0
data/ext/c5.0/extern.i +170 -0
data/ext/c5.0/formrules.c +720 -0
data/ext/c5.0/formtree.c +1158 -0
data/ext/c5.0/getdata.c +521 -0
data/ext/c5.0/getnames.c +733 -0
data/ext/c5.0/global.c +211 -0
data/ext/c5.0/gpl.txt +674 -0
data/ext/c5.0/implicitatt.c +1112 -0
data/ext/c5.0/info.c +146 -0
data/ext/c5.0/mcost.c +138 -0
data/ext/c5.0/modelfiles.c +952 -0
data/ext/c5.0/p-thresh.c +313 -0
data/ext/c5.0/prune.c +1069 -0
data/ext/c5.0/report.c +345 -0
data/ext/c5.0/rules.c +579 -0
data/ext/c5.0/ruletree.c +398 -0
data/ext/c5.0/siftrules.c +1285 -0
data/ext/c5.0/sort.c +156 -0
data/ext/c5.0/subset.c +599 -0
data/ext/c5.0/text.i +223 -0
data/ext/c5.0/trees.c +740 -0
data/ext/c5.0/update.c +129 -0
data/ext/c5.0/utility.c +1146 -0
data/ext/c5.0/xval +150 -0
data/ext/c5.0/xval.c +402 -0
data/ext/gritbot/Makefile +98 -0
data/ext/gritbot/check.c +1110 -0
data/ext/gritbot/cluster.c +342 -0
data/ext/gritbot/common.c +1269 -0
data/ext/gritbot/continatt.c +412 -0
data/ext/gritbot/defns.i +623 -0
data/ext/gritbot/discratt.c +459 -0
data/ext/gritbot/extern.i +101 -0
data/ext/gritbot/getdata.c +329 -0
data/ext/gritbot/getnames.c +573 -0
data/ext/gritbot/global.c +104 -0
data/ext/gritbot/gpl.txt +674 -0
data/ext/gritbot/gritbot.c +295 -0
data/ext/gritbot/implicitatt.c +1108 -0
data/ext/gritbot/inspect.c +794 -0
data/ext/gritbot/modelfiles.c +687 -0
data/ext/gritbot/outlier.c +415 -0
data/ext/gritbot/sort.c +130 -0
data/ext/gritbot/text.i +159 -0
data/ext/gritbot/update.c +126 -0
data/ext/gritbot/utility.c +1029 -0
data/ext/see5-installer/extconf.rb +25 -0
data/lib/see5/installer.rb +10 -0
data/lib/see5/installer/version.rb +7 -0
data/see5-installer.gemspec +30 -0
metadata +115 -0

data/ext/c5.0/xval ADDED Viewed

@@ -0,0 +1,150 @@
+#! /bin/csh
+#---------------------------------------------------------------------
+# Multi F-fold cross-validation script
+#---------------------------------------------------------------------
+#
+# Invocation:
+#   xval [C5.0 options] [F=folds] [R=repeats] [+label] [+d]
+#
+# Carries out R F-fold cross-validations
+#
+# If +d is used, individual results from each block are left in
+#     <filestem>.o<cross-validation no>[+label]
+# Averages over cross-validations are written to
+#     <filestem>.res[+label]
+#
+# Exit status:
+#     0   cross-validations completed, or help requested with -h
+#     1   unrecognised option, option given without its value,
+#         or fewer than one repeat requested
+#---------------------------------------------------------------------
+#       Sort the options into those applying to C5.0 and the rest
+set opts        =
+set folds       = 10
+set repeats     = 1
+set label       =
+set filestem    = undefined
+set rules       = 0
+set i = 1
+while ( $i <= $#argv )
+    set opt = $argv[$i]
+    switch ( $opt )
+    case "F=*":
+        set folds = `echo $opt | sed s/F=//`
+        breaksw
+    case "R=*":
+        set repeats = `echo $opt | sed s/R=//`
+        breaksw
+    case "+d":
+        set details
+        breaksw
+    case "+*":
+        set label = $opt
+        breaksw
+    case "-f":
+        #   The filestem is the following argument; make sure there is one
+        @ i++
+        if ( $i > $#argv ) then
+            echo "option" $opt "requires a value"
+            exit 1
+        endif
+        set filestem = $argv[$i]
+        breaksw
+    case "-f*":
+        set filestem = `echo $opt | sed s/-f//`
+        breaksw
+    case "-t":
+    case "-m":
+    case "-c":
+    case "-u":
+    case "-S":
+    case "-I":
+        #   Glue the separate value onto the option before passing it on
+        @ i++
+        if ( $i > $#argv ) then
+            echo "option" $opt "requires a value"
+            exit 1
+        endif
+        set opts = ( $opts ${opt}$argv[$i] )
+        breaksw
+    case "-b":
+    case "-p":
+    case "-e":
+    case "-t*":
+    case "-g":
+    case "-s":
+    case "-w":
+    case "-u*":
+    case "-m*":
+    case "-c*":
+    case "-S*":
+    case "-I*":
+        set opts = ( $opts $opt )
+        breaksw
+    case "-r":
+        set opts = ( $opts $opt )
+        set rules = 1
+        breaksw
+    case "-X":
+        @ i++
+        if ( $i > $#argv ) then
+            echo "option" $opt "requires a value"
+            exit 1
+        endif
+        set folds = $argv[$i]
+        breaksw
+    case "-X*":
+        set folds = `echo $opt | sed s/-X//`
+        breaksw
+    default:
+        echo "unrecognised or inappropriate option" $opt
+        #   Remember the failure, then fall through to print the summary
+        set badopt
+    case "-h":
+        echo ""
+        echo "Summary of options for xval:"
+        echo ""
+        echo "    F=<f>         set f folds"
+        echo "    R=<r>         repeat r times"
+        echo "    +d            retain detailed files"
+        echo "    +s            label all output files with suffix +s"
+        echo ""
+        echo "    -f <filestem> application filestem"
+        echo "    -r            use rule-based classifiers"
+        echo "    -u <bands>    order rules by utility"
+        echo "    -w            invoke attribute winnowing"
+        echo "    -b            invoke 10-trial boosting"
+        echo "    -t <trials>   number of boosting trials"
+        echo "    -p            use soft thresholds"
+        echo "    -e            focus on errors (ignore costs file)"
+        echo "    -s            find subset tests for discrete atts"
+        echo "    -m <objs>     restrict allowable splits"
+        echo "    -c <CF>       confidence level for pruning"
+        echo "    -S <percent>  training sample percentage"
+        echo "    -X <folds>    cross-validate"
+        echo "    -I <integer>  random seed [ignored]"
+        echo "    -h            print this message"
+        if ( $?badopt ) exit 1
+        exit 0
+    endsw
+    @ i++
+end
+#       With no repeats the loop below never sets outf, so stop before
+#       any file is created
+if ( $repeats < 1 ) then
+    echo "number of repeats must be at least 1"
+    exit 1
+endif
+#       Clear the summary file
+cp /dev/null $filestem.xsum
+#       Repeat cross-validations, incrementing the random seed
+set r = 0
+while ( $r < $repeats )
+    set outf = $filestem.o$r$label
+    c5.0 -f $filestem $opts -X $folds -I $r >$outf
+    grep "<<" $outf >> $filestem.xsum
+    @ r++
+end
+#       Find the number of cases in the training and test files
+#       (word 2 of the "Read" lines is the training count; word 9 is
+#       taken as the test count when a .test file exists)
+set junk = `grep ^Read $outf`
+@ examples = $junk[2]
+if ( -e $filestem.test ) then
+    @ examples += $junk[9]
+endif
+#       Remove the temporary file and summarize results
+report $examples $folds $repeats $rules <$filestem.xsum >$filestem.res$label
+rm $filestem.xsum
+if ( ! $?details ) rm -f $filestem.o[0-9]*$label

data/ext/c5.0/xval.c ADDED Viewed

@@ -0,0 +1,402 @@
+/*************************************************************************/
+/*									 */
+/*  Copyright 2010 Rulequest Research Pty Ltd.				 */
+/*									 */
+/*  This file is part of C5.0 GPL Edition, a single-threaded version	 */
+/*  of C5.0 release 2.07.						 */
+/*									 */
+/*  C5.0 GPL Edition is free software: you can redistribute it and/or	 */
+/*  modify it under the terms of the GNU General Public License as	 */
+/*  published by the Free Software Foundation, either version 3 of the	 */
+/*  License, or (at your option) any later version.			 */
+/*									 */
+/*  C5.0 GPL Edition is distributed in the hope that it will be useful,	 */
+/*  but WITHOUT ANY WARRANTY; without even the implied warranty of	 */
+/*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU	 */
+/*  General Public License for more details.				 */
+/*									 */
+/*  You should have received a copy of the GNU General Public License	 */
+/*  (gpl.txt) along with C5.0 GPL Edition.  If not, see 		 */
+/*									 */
+/*      <http://www.gnu.org/licenses/>.					 */
+/*									 */
+/*************************************************************************/
+/*************************************************************************/
+/*									 */
+/*	Carry out crossvalidation trials				 */
+/*	--------------------------------				 */
+/*									 */
+/*************************************************************************/
+#include "defns.i"
+#include "extern.i"
+DataRec	*Blocked=Nil;	/* cases arranged into fold blocks by Prepare() */
+float	**Result=Nil;	/* Result[f][0] = tree/ruleset size (-1 if boosted)
+				    [1] = tree/ruleset errors (as % of test block)
+				    [2] = tree/ruleset cost (mean per test case)  */
+/*************************************************************************/
+/*									 */
+/*	Outer function (differs from xval script)			 */
+/*									 */
+/*************************************************************************/
+void CrossVal()
+/*   --------  */
+{
+    CaseNo	Cs, BlockSize, Start=0, Held, AllMaxCase;
+    int		Fold, FirstBigFold, Tr, AllTRIALS, Single;
+    ClassNo	Pred, Real;
+    DataRec	Test;
+    float	*Res;
+    static CaseNo *ConfusionMat=Nil;
+    static int    SaveFOLDS=0;
+
+    /*  Release storage left behind if an earlier call was interrupted  */
+
+    if ( Result )
+    {
+	FreeVector((void **) Result, 0, SaveFOLDS-1);
+	Free(ConfusionMat);
+    }
+
+    /*  There cannot be more folds than cases  */
+
+    if ( FOLDS > MaxCase+1 )
+    {
+	fprintf(Of, T_FoldsReduced);
+	FOLDS = MaxCase+1;
+    }
+
+    SaveFOLDS    = FOLDS;
+    Result	 = AllocZero(FOLDS, float *);
+    Blocked	 = Alloc(MaxCase+1, DataRec);
+    ConfusionMat = AllocZero((MaxClass+1)*(MaxClass+1), CaseNo);
+
+    Prepare();
+
+    AllMaxCase = MaxCase;
+    AllTRIALS  = TRIALS;
+
+    /*  Test blocks before fold FirstBigFold hold BlockSize cases;
+	that fold and all later ones hold one more  */
+
+    FirstBigFold = FOLDS - ((MaxCase+1) % FOLDS);
+    BlockSize    = (MaxCase + 1) / FOLDS;
+
+    ForEach(Fold, 0, FOLDS-1)
+    {
+	fprintf(Of, "\n\n[ " T_Fold " %d ]\n", Fold+1);
+
+	Res = Result[Fold] = AllocZero(3, float);
+
+	if ( Fold == FirstBigFold ) BlockSize++;
+
+	/*  Copy everything except the next BlockSize cases into Case[]
+	    as training data, walking Blocked[] circularly from Start  */
+
+	MaxCase = AllMaxCase - BlockSize;
+
+	ForEach(Cs, 0, MaxCase)
+	{
+	    Case[Cs] = Blocked[Start];
+	    Start = (Start + 1) % (AllMaxCase + 1);
+	}
+
+	ConstructClassifiers();
+
+	/*  Size is recorded only for a single classifier; with more
+	    than one trial it is marked -1 and Default is set before
+	    the boosted classification below  */
+
+	Single = ( TRIALS == 1 );
+
+	if ( Single )
+	{
+	    Res[0] = ( RULES ? RuleSet[0]->SNRules :
+			       TreeSize(Pruned[0]) );
+	}
+	else
+	{
+	    Res[0] = -1;
+	    Default = ( RULES ? RuleSet[0]->SDefault : Pruned[0]->Leaf );
+	}
+
+	/*  Classify the held-out block, which starts where the
+	    training cases stopped  */
+
+	Held = Start;
+
+	ForEach(Cs, 0, BlockSize-1)
+	{
+	    Test = Blocked[Held];
+	    Case[Cs] = Test;
+
+	    if ( Single )
+	    {
+		Pred = ( RULES ? RuleClassify(Test, RuleSet[0]) :
+				 TreeClassify(Test, Pruned[0]) );
+	    }
+	    else
+	    {
+		Pred = BoostClassify(Test, TRIALS-1);
+	    }
+
+	    Real = Class(Test);
+
+	    if ( Pred != Real )
+	    {
+		Res[1] += 1.0;
+
+		if ( MCost )
+		{
+		    Res[2] += MCost[Pred][Real];
+		}
+	    }
+
+	    /*  Add to confusion matrix for target classifier  */
+
+	    ConfusionMat[ Real*(MaxClass+1)+Pred ]++;
+
+	    Held = (Held + 1) % (AllMaxCase + 1);
+	}
+
+	/*  Convert the error count to a percentage and the cost to a
+	    per-case average over the block  */
+
+	Res[1] = (100.0 * Res[1]) / BlockSize;
+	Res[2] /= BlockSize;
+
+	fprintf(Of, T_EvalHoldOut, BlockSize);
+
+	/*  The test block now occupies Case[0..BlockSize-1]  */
+
+	MaxCase = BlockSize-1;
+	Evaluate(0);
+
+	/*  Free space used by classifiers  */
+
+	ForEach(Tr, 0, MaxTree)
+	{
+	    FreeClassifier(Tr);
+	}
+	MaxTree = -1;
+
+	TRIALS = AllTRIALS;
+    }
+
+    /*  Print summary of crossvalidation  */
+
+    MaxCase = AllMaxCase;
+
+    Summary();
+    PrintConfusionMatrix(ConfusionMat);
+
+    /*  Put the cases back in blocked order and free local storage  */
+
+    ForEach(Cs, 0, MaxCase)
+    {
+	Case[Cs] = Blocked[Cs];
+    }
+
+    FreeVector((void **) Result, 0, FOLDS-1);		Result = Nil;
+    Free(Blocked);					Blocked = Nil;
+    Free(ConfusionMat);					ConfusionMat = Nil;
+}
+/*************************************************************************/
+/*                                                                       */
+/*      Prepare data for crossvalidation (similar to xval-prep.c)	 */
+/*                                                                       */
+/*************************************************************************/
+void Prepare()
+/*   -------  */
+{
+    CaseNo	*Order, Cs, GroupStart=0, GroupEnd, Swap, Filled=0, Offset;
+    ClassNo	ThisClass;
+
+    /*  Begin with the identity ordering of case numbers, then shuffle  */
+
+    Order = Alloc(MaxCase+1, CaseNo);
+    ForEach(Cs, 0, MaxCase)
+    {
+	Order[Cs] = Cs;
+    }
+
+    Shuffle(Order);
+
+    /*  Sort into class groups: take the class of the first ungrouped
+	case and swap every later case of that class up behind it  */
+
+    for ( ; GroupStart <= MaxCase ; GroupStart = GroupEnd+1 )
+    {
+	GroupEnd  = GroupStart;
+	ThisClass = Class(Case[Order[GroupStart]]);
+
+	ForEach(Cs, GroupStart+1, MaxCase)
+	{
+	    if ( Class(Case[Order[Cs]]) == ThisClass )
+	    {
+		GroupEnd++;
+
+		Swap		= Order[GroupEnd];
+		Order[GroupEnd] = Order[Cs];
+		Order[Cs]	= Swap;
+	    }
+	}
+    }
+
+    /*  Organize into stratified blocks: block number Offset takes
+	every FOLDS-th case of the grouped order, starting at Offset  */
+
+    ForEach(Offset, 0, FOLDS-1)
+    {
+	for ( Cs = Offset ; Cs <= MaxCase ; Cs += FOLDS )
+	{
+	    Blocked[Filled] = Case[Order[Cs]];
+	    Filled++;
+	}
+    }
+
+    Free(Order);
+}
+/*************************************************************************/
+/*                                                                       */
+/*      Shuffle the data cases                                           */
+/*                                                                       */
+/*************************************************************************/
+void Shuffle(int *Vec)
+/*   -------  */
+{
+    int	Pos, Pick, Remaining, Tmp;
+
+    /*  Restart the generator from KRInit before drawing  */
+
+    ResetKR(KRInit);
+
+    /*  Work along Vec[0..MaxCase]; swap each position with one picked
+	from the Remaining entries at or after it  */
+
+    for ( Pos = 0, Remaining = MaxCase+1 ; Remaining ; Pos++, Remaining-- )
+    {
+	Pick = Pos + Remaining * KRandom();
+
+	Tmp	  = Vec[Pos];
+	Vec[Pos]  = Vec[Pick];
+	Vec[Pick] = Tmp;
+    }
+}
+/*************************************************************************/
+/*									 */
+/*	Summarise a crossvalidation					 */
+/*									 */
+/*************************************************************************/
+char
+     *FoldHead[] = { F_Fold, F_UFold, "" };
+void Summary()
+/*   -------  */
+{
+    int		k, Fold, Line;
+    Boolean	ShowSize=true;
+    float	Total[3]={0, 0, 0}, TotalSq[3]={0, 0, 0}, *Res;
+    char	**Head;
+    extern char	*StdP[], *StdPC[], *Extra[], *ExtraC[];
+
+    /*  Sizes are shown only if every fold recorded a size of 1 or more  */
+
+    ForEach(Fold, 0, FOLDS-1)
+    {
+	if ( Result[Fold][0] < 1 ) ShowSize = false;
+    }
+
+    fprintf(Of, "\n\n[ " T_Summary " ]\n\n");
+
+    /*  Three heading lines; the right-hand column headings depend on
+	rules versus trees and on whether misclassification costs exist  */
+
+    if ( RULES )
+    {
+	Head = ( MCost ? ExtraC : Extra );
+    }
+    else
+    {
+	Head = ( MCost ? StdPC : StdP );
+    }
+
+    ForEach(Line, 0, 2)
+    {
+	fprintf(Of, "%s\t%s\n", FoldHead[Line], Head[Line]);
+    }
+    putc('\n', Of);
+
+    /*  One line per fold, accumulating sums and sums of squares  */
+
+    ForEach(Fold, 0, FOLDS-1)
+    {
+	Res = Result[Fold];
+
+	fprintf(Of, "%4d\t", Fold+1);
+
+	if ( ! ShowSize )
+	{
+	    fprintf(Of, "     *");
+	}
+	else
+	{
+	    fprintf(Of, " %5g", Res[0]);
+	}
+
+	fprintf(Of, " %10.1f%%", Res[1]);
+
+	if ( MCost )
+	{
+	    fprintf(Of, "%7.2f", Res[2]);
+	}
+
+	fprintf(Of, "\n");
+
+	for ( k = 0 ; k < 3 ; k++ )
+	{
+	    Total[k]   += Res[k];
+	    TotalSq[k] += Res[k] * Res[k];
+	}
+    }
+
+    /*  Means over the folds  */
+
+    fprintf(Of, "\n  " T_Mean "\t");
+
+    if ( ShowSize )
+    {
+	fprintf(Of, "%6.1f", Total[0] / FOLDS);
+    }
+    else
+    {
+	fprintf(Of, "      ");
+    }
+
+    fprintf(Of, " %10.1f%%", Total[1] / FOLDS);
+
+    if ( MCost )
+    {
+	fprintf(Of, "%7.2f", Total[2] / FOLDS);
+    }
+
+    /*  Standard errors of those means  */
+
+    fprintf(Of, "\n  " T_SE "\t");
+
+    if ( ShowSize )
+    {
+	fprintf(Of, "%6.1f", SE(Total[0], TotalSq[0], FOLDS));
+    }
+    else
+    {
+	fprintf(Of, "      ");
+    }
+
+    fprintf(Of, " %10.1f%%", SE(Total[1], TotalSq[1], FOLDS));
+
+    if ( MCost )
+    {
+	fprintf(Of, "%7.2f", SE(Total[2], TotalSq[2], FOLDS));
+    }
+
+    fprintf(Of, "\n");
+}
+/*************************************************************************/
+/*									 */
+/*	Standard error of the mean of `no' values, given their sum	 */
+/*	and sum of squares:						 */
+/*									 */
+/*	    sqrt( ((sumsq - no * mean^2) / (no - 1)) / no )		 */
+/*									 */
+/*	Returns 0 when no < 2, since a single value gives no estimate	 */
+/*	of spread and the formula would divide by zero.  A variance	 */
+/*	that comes out slightly negative through rounding (e.g. when	 */
+/*	all values are equal) is treated as zero, so the result is	 */
+/*	never NaN for finite inputs.					 */
+/*									 */
+/*************************************************************************/
+float SE(float sum, float sumsq, int no)
+/*    --  */
+{
+    double mean, var;
+
+    if ( no < 2 ) return 0.0;
+
+    /*  Work in double to limit cancellation error in the subtraction  */
+
+    mean = (double) sum / no;
+    var  = ((double) sumsq - no * mean * mean) / (no - 1);
+
+    if ( var < 0 ) var = 0;
+
+    return sqrt(var / no);
+}