RubyGems - biblicit - Versions diffs - 1.0 → 2.0.3 - Mend

biblicit 1.0 → 2.0.3

Files changed (406) hide show

data/parscit/bin/phOutput2xml.pl ADDED Viewed

@@ -0,0 +1,223 @@
+#!/usr/bin/env perl
+# -*- cperl -*-
+=head1 NAME
+phOutput2xml.pl
+=head1 SYNOPSIS
+ RCS:$Id$
+=head1 DESCRIPTION
+=head1 HISTORY
+ ORIGIN: created from templateApp.pl version 3.4 by Min-Yen Kan <kanmy@comp.nus.edu.sg>
+modified from output2xml.pl for ParsCit.
+ RCS:$Log$
+=cut
+require 5.0;
+use Getopt::Std;
+use strict 'vars';
+# use diagnostics;
+### USER customizable section
+# Build a per-run scratch path: the script name with every "." and "/"
+# removed, followed by the process id and the current epoch time, placed
+# under /tmp/.
+# NOTE(review): $tmpfile is not referenced again in the visible part of this
+# script - confirm before removing it.
+my $tmpfile .= $0; $tmpfile =~ s/[\.\/]//g;
+$tmpfile .= $$ . time;
+if ($tmpfile =~ /^([-\@\w.]+)$/) { $tmpfile = $1; }		      # untaint tmpfile variable
+$tmpfile = "/tmp/" . $tmpfile;
+# Program name used in every diagnostic: the last path component of $0.
+$0 =~ /([^\/]+)$/; my $progname = $1;
+my $outputVersion = "1.0";
+### END user customizable section
+### Ctrl-C handler
+# Installed for SIGINT: reports the interrupt on STDERR and ends the program.
+sub quitHandler {
+  my $notice = "\n# $progname fatal\t\tReceived a 'SIGINT'\n# $progname - exiting cleanly\n";
+  print STDERR $notice;
+  exit;
+}
+### HELP Sub-procedure
+# Writes the usage summary and the option list to STDERR.
+sub Help {
+  my @usageLines = (
+    "usage: $progname -h\t\t\t\t[invokes help]\n",
+    "       $progname -v\t\t\t\t[invokes version]\n",
+    "       $progname [-qEl] [-r <rankfile> -n <num>] filename(s)...\n",
+    "Options:\n",
+    "\t-q\tQuiet Mode (don't echo license)\n",
+    "\t-E\tTurn OFF error checking\n",
+    "\t-l\tEliminate newline tags\n",
+    "\t-r <file>\tSVM Ranking output file\n",
+    "\t-n <num>\tNumber of choices in both ranking file and input file\n",
+    "\n",
+    "Will accept input on STDIN as a single file.\n",
+    "\n",
+  );
+  print STDERR @usageLines;
+}
+### VERSION Sub-procedure
+# Displays this script's embedded POD through perldoc, then exits.
+# Dies if perldoc cannot be run or exits with a non-zero status.
+sub Version {
+  # List-form system() bypasses the shell, so a script path that contains
+  # spaces or shell metacharacters reaches perldoc as a single argument.
+  if (system ("perldoc", $0) != 0) {
+    die "Need \"perldoc\" in PATH to print version information";
+  }
+  exit;
+}
+# Emits the one-line copyright banner on STDERR.
+sub License {
+  my $banner = "# Copyright 2009 \251 by Min-Yen Kan\n";
+  print STDERR $banner;
+}
+###
+### MAIN program
+###
+# Command-line handling: record the invocation, install the SIGINT handler,
+# parse the switches and derive the run-time settings from them.
+my $cmdLine = $0 . " " . join (" ", @ARGV);
+if ($#ARGV == -1) { 		        # invoked with no arguments, possible error in execution?
+  print STDERR "# $progname info\t\tNo arguments detected, waiting for input on command line.\n";
+  print STDERR "# $progname info\t\tIf you need help, stop this program and reinvoke with \"-h\".\n";
+}
+$SIG{'INT'} = \&quitHandler;
+our ($opt_q, $opt_v, $opt_h, $opt_r, $opt_n, $opt_E, $opt_l);
+# getopts returns false on an unknown switch or a missing switch argument;
+# show the usage and stop instead of continuing with a half-parsed command line.
+if (!getopts ('Ehlqr:n:v')) {
+  Help();
+  exit (1);
+}
+# use (!defined $opt_X) for options with arguments
+if (!$opt_q) { License(); }		# call License, if asked for
+if ($opt_v) { Version(); exit(0); }	# call Version, if asked for
+if ($opt_h) { Help(); exit (0); }	# call help, if asked for
+my $errorChecking = (defined $opt_E) ? 0 : 1;   # -E switches gold/system comparison off
+my $ignoreNewlines = (defined $opt_l) ? 1 : 0;  # -l drops newline tokens
+my $svmRankFile = (defined $opt_r) ? $opt_r : undef;
+my $rankChoices = (defined $opt_n) ? $opt_n : undef;
+if ((defined $rankChoices && !defined $svmRankFile) ||
+    (!defined $rankChoices && defined $svmRankFile)) {
+  die "# $progname fatal\t\t-n and -r are mutually necessary switches";
+}
+# The rank file is consumed in groups of $rankChoices lines using a modulus,
+# so anything other than a positive integer would abort later with
+# "Illegal modulus zero" (or silently truncate); reject it here.
+if (defined $rankChoices && $rankChoices !~ /^[1-9]\d*$/) {
+  die "# $progname fatal\t\t-n requires a positive integer, got \"$rankChoices\"";
+}
+## standardize input stream (either STDIN or first arg on command line)
+# $fh holds the NAME of the handle to read from ("IF" or "STDIN"); the rest of
+# the script reads it symbolically with <$fh>.
+my $fh;
+my $filename = shift;
+# NEWFILE is the re-entry point used by the "goto NEWFILE" at the end of the
+# main program when further file names remain.  The label lives at file scope
+# (not inside the if block) because jumping INTO a construct with goto is
+# deprecated in Perl; $filename is always set by the time the goto runs, so
+# the first branch is taken on re-entry exactly as before.
+NEWFILE:
+if ($filename) {
+  if (!(-e $filename)) { die "# $progname crash\t\tFile \"$filename\" doesn't exist"; }
+  # Three-argument open: the file name can never be interpreted as an open
+  # mode or a pipe, and leading/trailing blanks in it are preserved.
+  open (*IF, '<', $filename) || die "# $progname crash\t\tCan't open \"$filename\": $!";
+  $fh = "IF";
+} else {
+  $filename = "<STDIN>";
+  $fh = "STDIN";
+}
+# open rank file info, if applicable
+# The rank file holds one numeric score per line, in consecutive groups of
+# $rankChoices lines.  For every group, @max receives the 0-based position
+# of the highest score inside that group (first one wins on ties).
+my $rfh;
+my @max = ();
+if (defined $rankChoices && defined $svmRankFile) {
+  open ($rfh, '<', $svmRankFile) || die "# $progname crash\t\tCan't open rankfile \"$svmRankFile\"!";
+  my $line = 0;				# lines read so far
+  my $curLine = 0;			# 1-based position inside the current group
+  my $max;				# best score of the current group, undef until one is seen
+  my $maxLine = 0;			# 0-based position of that best score
+  while (my $score = <$rfh>) {
+    chomp $score;			# chomp, not chop: a last line without newline keeps its final digit
+    $line++;
+    $curLine++;
+    # Seed the maximum from the first score of the group instead of from 0,
+    # so that groups whose scores are all zero or negative still select
+    # their real best candidate.
+    if (!defined $max || $score > $max) {   # advance max if applicable
+      $max = $score;
+      $maxLine = $curLine-1;
+    }
+    if ($line % $rankChoices == 0) {	      # save data at fencepost
+      $max[int($line/$rankChoices)-1] = $maxLine;
+      $curLine = 0;					# reset values
+      $max = undef;
+      $maxLine = 0;
+    }
+  }
+  close ($rfh);
+}
+## output XML file for display
+# Input format, as read below:
+#   "# <variant> <confidence>"  header line announcing a variant of an entry
+#   "#..."                      any other comment line, skipped
+#   blank line                  ends the current token sequence
+#   other lines                 tab-separated fields: first = token,
+#                               last = system tag, second to last = gold tag
+# Without variant headers every sequence becomes its own <entry>.  With
+# variant headers the variants are collected in $buf2 and written as one
+# <entry> once the next "variant 0" header has been seen (or at end of input).
+# NOTE(review): the "PARSHED" placeholder is only replaced when -l is given;
+# otherwise it stays in the output - confirm whether that is intended.
+my $line = 0;			# number of the entry being written
+my $buf = "";			# markup of the sequence being read
+my $buf2 = "";			# variants collected for the current entry
+my $lastTag = "";		# tag that is currently open in $buf
+my $variant = "";		# variant number from the last header, "" if none
+my $confidence = "1.0";
+print "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
+print "<?xml-stylesheet href=\"bibxml.xsl\" type=\"text/xsl\" ?>\n";
+print "<file>\n";
+while (<$fh>) {
+  if (/^\# (\d+) ([\.\d]+)/) {
+    $variant = $1;
+    $confidence = $2;
+    next;
+  }
+  elsif (/^\#/) { next; }			       # skip comments
+  if (/^\s*$/) {
+    $buf =~ s/&/&amp;/g;
+    if ($variant eq "") {
+      print "<entry no=\"$line\">\n";
+      if ($ignoreNewlines) {
+        $buf =~ s/\- ([a-z])/$1/g;	# rejoin words hyphenated across a line break
+        $buf =~ s/>\s+/>/g;
+        $buf =~ s/\s+</</g;
+        $buf =~ s/\s+$//g;
+        $buf =~ s/^\s+/</g;
+        $buf =~ s/PARSHED</\n</g;	# replace with newline and spaces for formatting
+      }
+      print "<variant no=\"0\" confidence=\"$confidence\">" . $buf . "</$lastTag>\n</variant>\n";
+      print "</entry>\n";
+      $line++;
+    } else {
+      # A new "variant 0" starts the next entry: flush what was collected.
+      if ($variant eq "0" && $buf2 ne "") {
+        print "<entry no=\"$line svmRank: $max[$line]\">\n" . $buf2 . "  </entry>\n";
+        $buf2 = "";
+        $line++;
+      }
+      $buf2 .= "<variant no=\"$variant\" confidence=\"$confidence\">\n" . $buf . "</$lastTag>\n</variant>\n";
+    }
+    $lastTag = "";
+    $buf = "";
+  } else {
+    chomp;			# chomp, not chop: keeps the tag intact when the last line has no newline
+    my @tokens = split (/\t/);
+    my $token = $tokens[0];
+    my $sys = $tokens[-1];
+    my $gold = $tokens[-2];
+    if ($sys ne $lastTag) {	# tag changed: close the old element, open the new one
+      if ($lastTag ne "") { $buf .= "</$lastTag>\n"; }
+      $buf .= "PARSHED<$sys>";
+    }
+    if ($token eq "+L+" && $ignoreNewlines) {
+      next;
+    }
+    if ($gold ne $sys && $errorChecking) {
+      $buf .= "<error correct=\"$gold\" taggedAs=\"$sys\">$token </error>";
+    } else {
+      $buf .= "$token ";
+    }
+    $lastTag = $sys;
+  }
+}
+# In variant mode an entry is only written when the NEXT entry begins, so the
+# variants of the final entry are still pending here; write them out instead
+# of dropping them.  Nothing is pending when no variant headers were used.
+if ($buf2 ne "") {
+  print "<entry no=\"$line svmRank: $max[$line]\">\n" . $buf2 . "  </entry>\n";
+  $buf2 = "";
+  $line++;
+}
+print "</file>\n";
+close ($fh);
+# More file names on the command line: process the next one the same way.
+if ($filename = shift) {
+  goto NEWFILE;
+}
+###
+### END of main program
+###

data/parscit/bin/redo.parsCit.pl ADDED Viewed

@@ -0,0 +1,105 @@
+#!/usr/bin/env perl
+# -*- cperl -*-
+### USER customizable section
+# Scratch-file prefix: script name without "." and "/", plus pid and epoch
+# time, under /tmp/.  Every intermediate file of this run starts with it.
+my $tmpfile = $0; $tmpfile =~ s/[\.\/]//g;
+$tmpfile .= $$ . time;
+if ($tmpfile =~ /^([-\@\w.]+)$/) { $tmpfile = $1; }		      # untaint tmpfile variable
+$tmpfile = "/tmp/" . $tmpfile;
+$0 =~ /([^\/]+)$/; my $progname = $1;
+my $outputVersion = "1.0";
+my $parscitHome = "/home/wing.nus/services/parscit/tools/";
+my $tr2crfppLoc = "$parscitHome/bin/tr2crfpp.pl";
+my $crf_learnLoc = "$ENV{'CRFPP_HOME'}/bin/crf_learn";
+my $crf_testLoc = "$ENV{'CRFPP_HOME'}/bin/crf_test";
+my $conllevalLoc = "$parscitHome/bin/conlleval.pl";
+my $crfTemplateLoc = "$parscitHome/crfpp/traindata/parsCit.template";
+### END user customizable section
+# Usage: redo.parsCit.pl <trainingFile> <folds>
+# Runs an n-fold cross-validation: deals the training lines round-robin into
+# <folds> test sets, trains one CRF++ model per fold on the other folds,
+# tests it on its own fold and evaluates the concatenated test output.
+my $trainingFile = $ARGV[0];
+my $folds = $ARGV[1];
+# Both arguments are required.  A missing or zero fold count would abort
+# below with "Illegal modulus zero", and a single fold leaves nothing to
+# train on, so insist on an integer of at least 2.
+if (!defined $trainingFile || !defined $folds || $folds !~ /^\d+$/ || $folds < 2) {
+  die "usage: $progname <trainingFile> <folds>\n\t<folds> must be an integer >= 2\n";
+}
+# Runs a shell command (optionally echoing it first) and warns when it does
+# not exit cleanly.  Returns the raw system() status.
+sub runCmd {
+  my ($cmd, $echo) = @_;
+  if ($echo) { print "$cmd\n"; }
+  my $status = system ($cmd);
+  if ($status != 0) {
+    warn "# $progname warning\tCommand ended with status $status: $cmd\n";
+  }
+  return $status;
+}
+# construct test data
+open (IF, '<', $trainingFile) || die "# $progname fatal\tTraining file cannot be opened \"$trainingFile\"!";
+# Open every fold file once instead of re-opening one for each input line.
+my @foldOut;
+for (my $f = 0; $f < $folds; $f++) {
+  open ($foldOut[$f], '>>', "$tmpfile.$f.test.src") || die "$progname fatal\tCan't append to file \"$tmpfile.$f.test.src\"!";
+}
+my $i = 0;
+while (<IF>) {
+  print { $foldOut[$i] } $_;
+  $i++;
+  $i = $i % $folds;
+}
+close (IF);
+foreach my $out (@foldOut) {
+  close ($out) || die "$progname fatal\tCan't finish writing a fold file: $!";
+}
+for (my $i = 0; $i < $folds; $i++) {
+  runCmd ("$tr2crfppLoc $tmpfile.$i.test.src> $tmpfile.$i.test", 0);
+}
+# construct training data
+# Fold $i is trained on the feature files of every other fold.
+for (my $i = 0; $i < $folds; $i++) {
+  for (my $j = 0; $j < $folds; $j++) {
+    if ($j == $i) {next; }
+    else {
+      runCmd ("cat $tmpfile.$j.test >> $tmpfile.$i.train", 0);
+    }
+  }
+}
+# train
+for (my $i = 0; $i < $folds; $i++) {
+  runCmd ("$crf_learnLoc -f 2 -c 3 $crfTemplateLoc $tmpfile.$i.train $tmpfile.$i.model ", 1);
+}
+# test
+for (my $i = 0; $i < $folds; $i++) {
+  runCmd ("$crf_testLoc -m $tmpfile.$i.model $tmpfile.$i.test > $tmpfile.$i.out", 1);
+  runCmd ("cat $tmpfile.$i.out >> $tmpfile.all.out ", 1);
+}
+# eval
+#for (my $i = 0; $i < $folds; $i++) {
+#  my $cmd = "$conllevalLoc -r -d \"	\" < $tmpfile.$i.out";
+#  print "$cmd\n";
+#  system ($cmd);
+#}
+# The delimiter handed to conlleval is a single tab character.
+runCmd ("$conllevalLoc -r -d \"\t\" < $tmpfile.all.out", 1);
+# clean up
+# Remove every scratch file of this run without spawning a shell.
+unlink (glob ("$tmpfile*"));
+######################################################################
+# .51
+# on head (first 500 lines of tagged.txt)
+# f=2, c=3 2fold: 92.86
+# f=2, c=3 2fold (more unigram): 93.23
+# 93.19 (with B features)
+# 93.35 without B features
+#
+# on tagged.txt
+# f=2, c=3 2fold cv: 95.24 / 93.99 => 94.61
+# f=2, c=5 2fold cv: => 94.55
+# f=2, c=3 2fold cv = 94.77
+#
+# .48
+# on tagged.txt (cat of all *tagged.txt):
+# normal, 2fold cv: 95.12 / 93.33
+# c=1.5, 2fold cv: 95.14 / 93.38
+# f=2, 2fold cv: 95.29 / 93.93
+# f=2, c=1.5 2fold cv: 95.31 / 93.82
+# f=2, c=3 2fold cv: 95.31 / 93.82
+# f=3, 2fold cv: 95.25 / 93.69
+#
+#
+# a=CRF-L1, f=2 2fold cv: 88.25 / 91.29
+# a=CRF-L1 2fold cv: 80.63 / -- didn't complete
+# a=MIRA 2fold cv: 94.48 / 92.69
+# a=MIRA, f=2 2fold cv: 94.31 / 93.60
+# 100326 .51 normal, 2fold cv, over all data (including iconip)
+# accuracy:  94.83%; precision:  94.83%; recall:  94.83%; FB1:  94.83

data/parscit/bin/sectExtract.pl ADDED Viewed

@@ -0,0 +1,149 @@
+#!/usr/bin/perl -wT
+# Author: Luong Minh Thang <luongmin@comp.nus.edu.sg>, generated at Wed, 03 Mar 2010 00:36:36
+# Modified from template by Min-Yen Kan <kanmy@comp.nus.edu.sg>
+require 5.0;
+use strict;
+use Getopt::Long;
+# I do not know a better solution to find a lib path in -T mode.
+# So if you know a better solution, I'd be glad to hear.
+# See this http://www.perlmonks.org/?node_id=585299 for why I
+# used the below code
+use FindBin;
+my $path;
+BEGIN
+{
+	if ($FindBin::Bin =~ /(.*)/) { $path = $1; }
+}
+use lib "$path/../lib";
+use SectLabel::Config;
+use SectLabel::Controller;
+### USER customizable section
+# Program name used in the usage text: the last path component of $0.
+$0 =~ /([^\/]+)$/; my $progname = $1;
+# NOTE(review): $outputVersion is not referenced in the visible part of this
+# script - confirm before removing it.
+my $outputVersion = "1.0";
+### END user customizable section
+# Emits the one-line copyright banner on STDERR.
+sub License {
+  my $banner = "# Copyright 2009 \251 by Luong Minh Thang\n";
+  print STDERR $banner;
+}
+### HELP Sub-procedure
+# Writes the usage summary and the option list to STDERR.  Every switch
+# accepted by the GetOptions call in the main program is described here,
+# including the required -in and the -new preprocessor switch.
+sub Help {
+  print STDERR "usage: $progname -h\t[invokes help]\n";
+  print STDERR "       $progname -in inFile [-out outFile -no-xmlInput -no-xmlOutput -log -new -q]\n";
+  print STDERR "Options:\n";
+  print STDERR "\t-q\tQuiet Mode (don't echo license)\n";
+  print STDERR "\t-in: indicate input file (required)\n";
+  print STDERR "\t-out: indicate output file (if not specified output to STDOUT)\n";
+  print STDERR "\t-no-xmlInput: indicate that input is normal text file (default: assume XML file from Omnipage-multiple pages concatenated)\n";
+  print STDERR "\t-no-xmlOutput: do not wrap results in XML format (default: xmlOutput)\n";
+  print STDERR "\t-log: output debugging messages\n";
+  print STDERR "\t-new: preprocess XML input with processOmniXMLv2.pl instead of processOmniXML.pl\n";
+}
+# Main program: parse the switches, sanitise the environment for taint mode,
+# optionally preprocess the Omnipage XML input into a temporary file, run the
+# section extraction and write the result to -out or STDOUT.
+my $QUIET = 0;
+my $HELP = 0;
+my $inFile = undef;
+my $outFile = undef;
+my $isXmlInput = 1;
+my $isXmlOutput = 1;
+my $isDebug = 0;
+my $isNew = 0; # if = 1, preprocess with processOmniXMLv2.pl instead of processOmniXML.pl
+$HELP = 1 unless GetOptions('in=s' => \$inFile,
+			    'out=s' => \$outFile,
+			    'xmlInput!' => \$isXmlInput,
+			    'xmlOutput!' => \$isXmlOutput,
+			    'log' => \$isDebug,
+			    'new' => \$isNew,
+			    'h' => \$HELP,
+			    'q' => \$QUIET);
+if ($HELP || !defined $inFile) {
+  Help();
+  exit(0);
+}
+if (!$QUIET) {
+  License();
+}
+### Untaint ###
+$inFile = untaintPath($inFile);
+my $envPath = $ENV{'PATH'};
+$envPath = untaintPath($envPath);
+$ENV{'PATH'} = $envPath;
+# Under -T, system() and backticks refuse to run while any of these variables
+# is set from the outside, so drop them as well as cleaning PATH.
+delete @ENV{qw(IFS CDPATH ENV BASH_ENV)};
+### End untaint ###
+my $modelFile = $isXmlInput? $SectLabel::Config::modelXmlFile : $SectLabel::Config::modelFile;
+$modelFile = "$path/../$modelFile";
+my $configFile = $isXmlInput ? $SectLabel::Config::configXmlFile : $SectLabel::Config::configFile;
+$configFile = "$path/../$configFile";
+if($isXmlInput){
+  # Convert the XML input into a temporary feature file, which then takes the
+  # place of the original input for the extraction step.
+  my $xmlInFile = newTmpFile();
+  $xmlInFile = untaintPath($xmlInFile);
+  my $cmd = "$path/sectLabel/";
+  $cmd .= ($isNew) ? "processOmniXMLv2.pl" : "processOmniXML.pl";
+  $cmd .= " -in $inFile -out $xmlInFile -xmlFeature -decode";
+  # execute() hands back the system() status; stop here when preprocessing
+  # failed rather than running the extraction on a missing or partial file.
+  my $status = execute($cmd);
+  if ($status != 0) {
+    unlink($xmlInFile);
+    die "$progname fatal\tPreprocessing failed with status $status: $cmd\n";
+  }
+  $inFile = $xmlInFile;
+}
+my $dictFile = $SectLabel::Config::dictFile;
+$dictFile = "$path/../$dictFile";
+my $funcFile = $SectLabel::Config::funcFile;
+$funcFile = "$path/../$funcFile";
+# The result is used below as a reference to the output text.
+my $rXML = SectLabel::Controller::extractSection($inFile, $isXmlOutput, $modelFile, $dictFile, $funcFile, $configFile, $isXmlInput, $isDebug);
+if($isXmlInput){
+  unlink($inFile);	# at this point $inFile is the temporary feature file
+}
+if (defined $outFile) {
+  $outFile = untaintPath($outFile);
+  open (my $out, ">:utf8", $outFile) or die "Could not open $outFile for writing: $!";
+  print $out $$rXML;
+  # Buffered write errors only surface at close time.
+  close ($out) or die "Could not finish writing $outFile: $!";
+} else {
+  print "$$rXML";
+}
+# Checks a path-like string against the allowed character set (word
+# characters, "-", "/", ".", ":" and blank) and returns the captured, and
+# therefore untainted, copy.  Dies with "Bad path ..." on anything else.
+sub untaintPath {
+  my ($path) = @_;
+  my ($clean) = ( $path =~ /^([-_\/\w\.\d: ]+)$/ );
+  if ( !defined $clean ) {
+    die "Bad path $path\n";
+  }
+  return $clean;
+}
+# Checks a command string against the allowed character set and returns the
+# captured (untainted) copy.  Dies with "Bad data in ..." on anything else.
+sub untaint {
+  my ($s) = @_;
+  my ($clean) = ( $s =~ /^([\w \-\@\(\),\.\/<>]+)$/ );   # captured copy is untainted
+  if ( !defined $clean ) {
+    die "Bad data in $s";  # log this somewhere
+  }
+  return $clean;
+}
+# Announces a command on STDERR, untaints it and runs it with system().
+# Returns whatever system() returned.
+sub execute {
+  my ($cmd) = @_;
+  print STDERR "Executing: $cmd\n";
+  my $safeCmd = untaint($cmd);
+  return system($safeCmd);
+}
+# Returns a name for a temporary file of the form YYYYMMDD-HHMMSS-<pid>,
+# built from the local time and the process id.  The name is relative, so the
+# file ends up in the current working directory of whoever creates it.
+sub newTmpFile {
+  # Format the timestamp in-process instead of spawning a shell and `date`:
+  # same result, no dependency on PATH, and nothing for taint mode to reject.
+  require POSIX;
+  my $tmpFile = POSIX::strftime('%Y%m%d-%H%M%S', localtime()) . "-$$";
+  return $tmpFile;
+}

data/parscit/bin/sectLabel/README ADDED Viewed

@@ -0,0 +1,110 @@
+README for sectLabel module (v100401)
+CONTENTS
+[0] Directory structure
+[1] Command line Usage
+	[1.1] SectLabel
+	[1.2] GenericSect
+[3] Known issues
+------------------------------------------------------------
+[0] DIRECTORY STRUCTURE
+* processOmniXML.pl: Process Omnipage XML output (concatenated results
+  from all pages of a PDF file), and extract text lines together with
+  other XML infos
+Note: the current script is complicated since it mixes 2 things: processing Omnipage XML as well as extracting XML features. We are planning to break it into 2 scripts: 1) simplifyOmniXML.pl (Done!) -- to convert Omnipage output into the internal format, and 2) extractXMLFeatures.pl (TODO) -- to take as input the internal results produced by simplifyOmniXML.pl and generate XML features.
+* redo.sectLabel.pl: Perform stratified cross-validation for SectLabel
+* tr2crfpp.pl: Generate SectLabel features for CRF++
+* single2multi.pl: Convert SectLabel training file
+  (e.g. doc/sectLabel.tagged.txt) from single- to multi-line
+  format. This script is called by tr2crfpp.pl
+* genericSectExtract.rb: given a list of section headers of a
+  scientific document in an input file, assign generic headers for the
+  section headers.
+* genericSect/
+------------------------------------------------------------
+[1] COMMAND LINE USAGE
+------------------------------
+[1.1] SectLabel
+* Process Omnipage XML output
+** Usage: processOmniXML.pl -h     [invokes help]
+          processOmniXML.pl -in xmlFile -out outFile [-xmlFeature -decode -markup -para] [-tag tagFile -allowEmptyLine -log]
+Options:
+        -q      Quiet Mode (don't echo license)
+        -xmlFeature: append XML feature together with text extracted
+        -decode: decode HTML entities and then output, to avoid double
+         entity encoding later
+        -tag tagFile: count XML tags/values for statistics
+        -markup: add factor infos (bold, italic etc) per word using
+         the format "word|||(b|nb)|||(i|ni)", useful in extracting
+         bold/italic phrases
+* Perform stratified cross-validation
+** Usage: redo.sectLabel.pl -h     [invokes help]
+          redo.sectLabel.pl -in trainFile -dir outDir -n folds -c configFile [-p numCpus -iter numIter -f freqCutoff]
+Options:
+                -in: training file in the format as in
+                 doc/sectLabel.tagged.txt
+                -dir: output directory, containing all intermediate
+                 files and outputs
+                -n: num of cross validation folds
+                -c: config file to extract features and automatically
+                 generate CRF++ template
+                -p: CRF++ num of CPUs (default = 6)
+                -iter: CRF++ max iteration (default = 100)
+                -f: CRF++ frequency cut-off (default = 3)
+** E.g.:
+./bin/sectLabel/redo.sectLabel.pl -in ./doc/sectLabelXml.tagged.txt
+-dir testRedoDir -n 10 -c ./resources/sectLabel/sectLabel.configXml
+* Extract features
+** Usage: tr2crfpp.pl -h   [invokes help]
+          tr2crfpp.pl -in inFile -c configFile -out outFile [-template -single]
+Options:
+        -q      Quiet Mode (don't echo license)
+        -in inFile: labeled input file
+        -c configFile: to specify which feature set to use.
+        -out outFile: output file for CRF++ training.
+        -template: to output a template used by CRF++ according to the
+         config file.
+        -single: indicate that each input document is in single-line
+         format (e.g., ./doc/sectLabel.tagged.txt)
+------------------------------
+[1.2] GenericSect
+* Create feature file
+** Usage: ruby extractFeature.rb filePath
+   filePath: path to the labeled data file which lists the actual
+   section headers and their corresponding manually assigned generic
+   section headers (if it exists)
+   syntax: generic_header ||| actual_header
+* Generate generic section headers for a document
+** Usage: ruby genericSectExtract.rb filePath
+   where filePath is a file which lists the actual headers of a
+   document (automatically extracted by another module of SectLabel)
+* Perform stratified cross-validation
+** Usage: ruby crossValidation.rb dataFile numFold
+   Note that data file has the format as in doc/genericSect.tagged.txt
+------------------------------------------------------------
+[3] KNOWN ISSUES

data/parscit/bin/sectLabel/README.txt ADDED Viewed

@@ -0,0 +1,110 @@
+README for sectLabel module (v100401)
+CONTENTS
+[0] Directory structure
+[1] Command line Usage
+	[1.1] SectLabel
+	[1.2] GenericSect
+[3] Known issues
+------------------------------------------------------------
+[0] DIRECTORY STRUCTURE
+* processOmniXML.pl: Process Omnipage XML output (concatenated results
+  from all pages of a PDF file), and extract text lines together with
+  other XML infos
+Note: the current script is complicated since it mixes 2 things: processing Omnipage XML as well as extracting XML features. We are planning to break it into 2 scripts: 1) simplifyOmniXML.pl (Done!) -- to convert Omnipage output into the internal format, and 2) extractXMLFeatures.pl (TODO) -- to take as input the internal results produced by simplifyOmniXML.pl and generate XML features.
+* redo.sectLabel.pl: Perform stratified cross-validation for SectLabel
+* tr2crfpp.pl: Generate SectLabel features for CRF++
+* single2multi.pl: Convert SectLabel training file
+  (e.g. doc/sectLabel.tagged.txt) from single- to multi-line
+  format. This script is called by tr2crfpp.pl
+* genericSectExtract.rb: given a list of section headers of a
+  scientific document in an input file, assign generic headers for the
+  section headers.
+* genericSect/
+------------------------------------------------------------
+[1] COMMAND LINE USAGE
+------------------------------
+[1.1] SectLabel
+* Process Omnipage XML output
+** Usage: processOmniXML.pl -h     [invokes help]
+          processOmniXML.pl -in xmlFile -out outFile [-xmlFeature -decode -markup -para] [-tag tagFile -allowEmptyLine -log]
+Options:
+        -q      Quiet Mode (don't echo license)
+        -xmlFeature: append XML feature together with text extracted
+        -decode: decode HTML entities and then output, to avoid double
+         entity encoding later
+        -tag tagFile: count XML tags/values for statistics
+        -markup: add factor infos (bold, italic etc) per word using
+         the format "word|||(b|nb)|||(i|ni)", useful in extracting
+         bold/italic phrases
+* Perform stratified cross-validation
+** Usage: redo.sectLabel.pl -h     [invokes help]
+          redo.sectLabel.pl -in trainFile -dir outDir -n folds -c configFile [-p numCpus -iter numIter -f freqCutoff]
+Options:
+                -in: training file in the format as in
+                 doc/sectLabel.tagged.txt
+                -dir: output directory, containing all intermediate
+                 files and outputs
+                -n: num of cross validation folds
+                -c: config file to extract features and automatically
+                 generate CRF++ template
+                -p: CRF++ num of CPUs (default = 6)
+                -iter: CRF++ max iteration (default = 100)
+                -f: CRF++ frequency cut-off (default = 3)
+** E.g.:
+./bin/sectLabel/redo.sectLabel.pl -in ./doc/sectLabelXml.tagged.txt
+-dir testRedoDir -n 10 -c ./resources/sectLabel/sectLabel.configXml
+* Extract features
+** Usage: tr2crfpp.pl -h   [invokes help]
+          tr2crfpp.pl -in inFile -c configFile -out outFile [-template -single]
+Options:
+        -q      Quiet Mode (don't echo license)
+        -in inFile: labeled input file
+        -c configFile: to specify which feature set to use.
+        -out outFile: output file for CRF++ training.
+        -template: to output a template used by CRF++ according to the
+         config file.
+        -single: indicate that each input document is in single-line
+         format (e.g., ./doc/sectLabel.tagged.txt)
+------------------------------
+[1.2] GenericSect
+* Create feature file
+** Usage: ruby extractFeature.rb filePath
+   filePath: path to the labeled data file which lists the actual
+   section headers and their corresponding manually assigned generic
+   section headers (if it exists)
+   syntax: generic_header ||| actual_header
+* Generate generic section headers for a document
+** Usage: ruby genericSectExtract.rb filePath
+   where filePath is a file which lists the actual headers of a
+   document (automatically extracted by another module of SectLabel)
+* Perform stratified cross-validation
+** Usage: ruby crossValidation.rb dataFile numFold
+   Note that data file has the format as in doc/genericSect.tagged.txt
+------------------------------------------------------------
+[3] KNOWN ISSUES