RubyGems - abstractor - Versions diffs - 1.0.8 - Mend

abstractor 1.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (346) hide show

data/lib/lingscope/src/lingscope/algorithms/negex/Sorter.java ADDED Viewed

@@ -0,0 +1,36 @@
+package lingscope.algorithms.negex;
+import java.util.*;
+// Utility class to sort the negation rules by length in descending order.
+// Rules need to be matched by longest first because there is overlap between the
+// RegEx of the rules.
+//
+// Author: Imre Solti
+// solti@u.washington.edu
+// Date: 10/20/2008
+public class Sorter {
+    /**
+     * Sorts the given negation rules in place so that the rule with the
+     * longest trimmed text comes first. Rules whose trimmed lengths are equal
+     * keep their relative order.
+     * @param unsortedRules the rules to sort; this list itself is reordered
+     * @return the same list instance that was passed in, or null if
+     * unsortedRules is null
+     */
+    public List<String> sortRules(List<String> unsortedRules) {
+        if (unsortedRules == null) {
+            return null;
+        }
+        try {
+            // Sort into descending order by length of string: the longest
+            // rules must match first because the rule patterns overlap.
+            Collections.sort(unsortedRules, new Comparator<String>() {
+                public int compare(String first, String second) {
+                    // Lengths are never negative, so the subtraction cannot overflow.
+                    return trimmedLength(second) - trimmedLength(first);
+                }
+            });
+        } catch (RuntimeException e) {
+            // Best effort: report the problem (for example a read-only list)
+            // and hand the list back as it is.
+            System.out.println(e);
+        }
+        return unsortedRules;
+    }
+    /**
+     * Length of the rule without surrounding whitespace; a null rule counts
+     * as empty.
+     */
+    private static int trimmedLength(String rule) {
+        return rule == null ? 0 : rule.trim().length();
+    }
+}

data/lib/lingscope/src/lingscope/drivers/AnnotatedFilesMerger.java ADDED Viewed

@@ -0,0 +1,61 @@
+package lingscope.drivers;
+import java.util.ArrayList;
+import java.util.List;
+import lingscope.io.AnnotatedSentencesIO;
+import lingscope.structures.AnnotatedSentence;
+/**
+ * Merges two annotated files token by token: the words are taken from the
+ * first file and the tags from the second. Useful to merge a words scope file
+ * with a POS cue file
+ * @author shashank
+ */
+public class AnnotatedFilesMerger {
+    /**
+     * Prints the usage for the annotated files merger
+     */
+    private static void usage() {
+        System.err.println("Usage:\njava lingscope.drivers.AnnotatedFilesMerger words_file tags_file merged_output_file");
+    }
+    /**
+     * Merges the given wordsSentence and the given tagsSentence into one
+     * sentence whose tokens have the form word|tag and are separated by a
+     * single space
+     * @param wordsSentence sentence from which the words are taken
+     * @param tagsSentence sentence from which the tags are taken
+     * @return the merged sentence, or null (after reporting both sentences on
+     * standard error) if the number of tags differs from the number of words
+     */
+    public static AnnotatedSentence merge(AnnotatedSentence wordsSentence, AnnotatedSentence tagsSentence) {
+        List<String> words = wordsSentence.getWords();
+        List<String> tags = tagsSentence.getTags();
+        int numTokens = words.size();
+        if (tags.size() != numTokens) {
+            System.err.println("Skipping non-equal length sentences");
+            System.err.println("\tSentence 1: " + wordsSentence.getRawText());
+            System.err.println("\tSentence 2: " + tagsSentence.getRawText());
+            return null;
+        }
+        StringBuilder mergedSentence = new StringBuilder();
+        for (int j = 0; j < numTokens; ++j) {
+            mergedSentence.append(" ").append(words.get(j)).append("|").append(tags.get(j));
+        }
+        // Every token was prefixed with a space; drop the leading one.
+        return new AnnotatedSentence(mergedSentence.substring(1));
+    }
+    /**
+     * Reads both files, merges their sentences pairwise and writes the merged
+     * sentences to the output file. Sentence pairs that cannot be merged are
+     * left out of the output.
+     * @param args
+     * 0 - file 1: the file from which words will be taken
+     * 1 - file 2: the file from which tags will be taken
+     * 2 - output file path
+     */
+    public static void main(String[] args) {
+        if (args.length < 3) {
+            usage();
+            System.exit(1);
+        }
+        List<AnnotatedSentence> wordsSentences = AnnotatedSentencesIO.read(args[0]);
+        List<AnnotatedSentence> tagsSentences = AnnotatedSentencesIO.read(args[1]);
+        // Only sentences present in both files can be paired up; indexing past
+        // the end of the shorter list would fail.
+        int numSentences = Math.min(wordsSentences.size(), tagsSentences.size());
+        if (wordsSentences.size() != tagsSentences.size()) {
+            System.err.println("Files contain a different number of sentences ("
+                    + wordsSentences.size() + " vs " + tagsSentences.size()
+                    + "); merging the first " + numSentences + " only");
+        }
+        List<AnnotatedSentence> mergedSentences = new ArrayList<AnnotatedSentence>(numSentences);
+        for (int i = 0; i < numSentences; ++i) {
+            AnnotatedSentence mergedSentence = merge(wordsSentences.get(i), tagsSentences.get(i));
+            if (mergedSentence != null) {
+                mergedSentences.add(mergedSentence);
+            }
+        }
+        AnnotatedSentencesIO.write(args[2], mergedSentences);
+    }
+}

data/lib/lingscope/src/lingscope/drivers/AnnotationComparerDriver.java ADDED Viewed

@@ -0,0 +1,22 @@
+package lingscope.drivers;
+import lingscope.algorithms.AnnotationComparer;
+/**
+ * Command line entry point that hands a gold file and a test file to an
+ * {@link AnnotationComparer} and then asks it to print its statistics
+ * @author shashank
+ */
+public class AnnotationComparerDriver {
+    /**
+     * Runs the comparison
+     * @param args
+     * 0 - gold file
+     * 1 - test file
+     */
+    public static void main(String[] args) {
+        final AnnotationComparer annotationComparer = new AnnotationComparer(10);
+        final String goldFile = args[0];
+        final String testFile = args[1];
+        annotationComparer.compareAnnotationFiles(goldFile, testFile);
+        annotationComparer.printStats();
+    }
+}

data/lib/lingscope/src/lingscope/drivers/BaselineDriver.java ADDED Viewed

@@ -0,0 +1,45 @@
+package lingscope.drivers;
+import java.util.List;
+import lingscope.algorithms.Annotator;
+import lingscope.algorithms.BaselineScopeAnnotator;
+import lingscope.io.AnnotatedSentencesIO;
+import lingscope.structures.AnnotatedSentence;
+/**
+ * Driver that annotates the sentences of an input file with a baseline cue or
+ * scope annotator and writes the annotated sentences to an output file
+ * @author shashank
+ */
+public class BaselineDriver {
+    /** Number of command line arguments that main reads (indices 0 to 6) */
+    private static final int NUM_ARGS = 7;
+    /**
+     * Prints the usage for the baseline driver
+     */
+    private static void usage() {
+        System.err.println("Usage:\njava lingscope.drivers.BaselineDriver (cue|scope) serialized_annotator_file input_file output_file limit_scope_by_commas(true|false) limit_scope_by_periods(true|false) input_is_annotated(true|false)");
+    }
+    /**
+     *
+     * @param args
+     * 0 - Annotator type ("cue" or "scope")
+     * 1 - Serialized annotator file
+     * 2 - Input file
+     * 3 - Output file
+     * 4 - if annotator type is "scope", then should scope be limited by commas
+     * 5 - if annotator type is "scope", then should scope be limited by periods
+     * 6 - (boolean) does the input file contain annotated sentence (true) or
+     * not (false)
+     */
+    public static void main(String[] args) {
+        // Index 6 is read for every annotator type, so all seven arguments
+        // must be present even when 4 and 5 are not used.
+        if (args.length < NUM_ARGS) {
+            usage();
+            System.exit(1);
+        }
+        Annotator annotator;
+        if (args[0].equalsIgnoreCase("scope")) {
+            annotator = new BaselineScopeAnnotator(SentenceTagger.SCOPE_START,
+                    SentenceTagger.SCOPE_INTER, SentenceTagger.OTHER,
+                    Boolean.parseBoolean(args[4]), Boolean.parseBoolean(args[5]));
+        } else {
+            annotator = SentenceTagger.getAnnotator("baseline", args[0]);
+        }
+        // Guard against an annotator that could not be created for the given type.
+        if (annotator == null) {
+            usage();
+            System.exit(1);
+        }
+        annotator.loadAnnotator(args[1]);
+        boolean isAnnotated = Boolean.parseBoolean(args[6]);
+        List<String> inputSentences = SentenceTagger.getStringList(args[2], isAnnotated);
+        List<AnnotatedSentence> outputSentences = SentenceTagger.annotateSentences(annotator, inputSentences, isAnnotated);
+        AnnotatedSentencesIO.write(args[3], outputSentences);
+    }
+}

data/lib/lingscope/src/lingscope/drivers/CrfDriver.java ADDED Viewed

@@ -0,0 +1,31 @@
+package lingscope.drivers;
+import java.util.List;
+import lingscope.algorithms.Annotator;
+import lingscope.io.AnnotatedSentencesIO;
+import lingscope.structures.AnnotatedSentence;
+/**
+ * Driver to mark scope or cue in a file with a CRF annotator
+ * @author shashank
+ */
+public class CrfDriver {
+    /** Number of command line arguments that main reads (indices 0 to 4) */
+    private static final int NUM_ARGS = 5;
+    /**
+     * Prints the usage for the CRF driver
+     */
+    private static void usage() {
+        System.err.println("Usage:\njava lingscope.drivers.CrfDriver (cue|scope) serialized_annotator_file input_file output_file input_is_annotated(true|false)");
+    }
+    /**
+     * Loads the serialized annotator, annotates every sentence of the input
+     * file and writes the annotated sentences to the output file
+     * @param args
+     * 0 - Annotator type ("cue" or "scope")
+     * 1 - Serialized annotator file
+     * 2 - Input file
+     * 3 - Output file
+     * 4 - (boolean) does the input file contain annotated sentence (true) or
+     * not (false)
+     */
+    public static void main(String[] args) {
+        if (args.length < NUM_ARGS) {
+            usage();
+            System.exit(1);
+        }
+        Annotator annotator = SentenceTagger.getAnnotator("crf", args[0]);
+        // Guard against an annotator that could not be created for the given type.
+        if (annotator == null) {
+            usage();
+            System.exit(1);
+        }
+        annotator.loadAnnotator(args[1]);
+        boolean isAnnotated = Boolean.parseBoolean(args[4]);
+        List<String> inputSentences = SentenceTagger.getStringList(args[2], isAnnotated);
+        List<AnnotatedSentence> outputSentences = SentenceTagger.annotateSentences(annotator, inputSentences, isAnnotated);
+        AnnotatedSentencesIO.write(args[3], outputSentences);
+    }
+}

data/lib/lingscope/src/lingscope/drivers/CueAndPosFilesMerger.java ADDED Viewed

@@ -0,0 +1,86 @@
+package lingscope.drivers;
+import generalutils.FileOperations;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import lingscope.io.AnnotatedSentencesIO;
+import lingscope.structures.AnnotatedSentence;
+/**
+ * Merges two files, one containing pos tags and the other containing
+ * annotated cues
+ * @author shashank
+ */
+public class CueAndPosFilesMerger {
+    /**
+     * Prints the usage for the cue and pos files merger
+     */
+    private static void usage() {
+        System.err.println("Usage:\njava lingscope.drivers.CueAndPosFilesMerger cue_file pos_file replace_cue_with_custom_tag(true|false) merged_output_file");
+    }
+    /**
+     * Merges one cue sentence with its POS sentence. For every token of the
+     * POS sentence the output contains token|tag, where tag is the tag of the
+     * cue sentence at that position and token is the POS tag, unless the
+     * position is marked as annotated in the cue sentence; in that case token
+     * is either the literal CUE or the original word.
+     * @param cueSentence sentence that supplies the words, the tags and the
+     * per-token annotated flags
+     * @param posSentence whitespace separated POS tags; the number of merged
+     * tokens is taken from this sentence
+     * @param replaceTags if true annotated words are replaced with CUE,
+     * otherwise the word itself is kept
+     * @return the merged sentence
+     */
+    public static AnnotatedSentence merge(AnnotatedSentence cueSentence, String posSentence, boolean replaceTags) {
+        // Trim first: leading whitespace would otherwise make split() return
+        // an empty first token and shift every POS tag by one position.
+        String[] posTags = posSentence.trim().split("\\s+");
+        List<String> crfTags = cueSentence.getTags();
+        List<String> words = cueSentence.getWords();
+        List<Boolean> tagStatusList = cueSentence.getIsAnnotatedTags();
+        StringBuilder mergedSentence = new StringBuilder();
+        int numWords = posTags.length;
+        for (int j = 0; j < numWords; ++j) {
+            mergedSentence.append(" ");
+            String posTag = posTags[j];
+            String word = words.get(j);
+            String crfTag = crfTags.get(j);
+            boolean tagStatus = tagStatusList.get(j);
+            if (tagStatus) {
+                if (replaceTags) {
+                    mergedSentence.append("CUE|");
+                } else {
+                    mergedSentence.append(word).append("|");
+                }
+            } else {
+                mergedSentence.append(posTag).append("|");
+            }
+            mergedSentence.append(crfTag);
+        }
+        // Every token was prefixed with a space; drop the leading one.
+        AnnotatedSentence mergedAnnotatedSentence = new AnnotatedSentence(mergedSentence.substring(1));
+        return mergedAnnotatedSentence;
+    }
+    /**
+     * Merges the cueSentences and posSentences pairwise. If the two lists do
+     * not have the same size, a warning is printed on standard error and only
+     * the sentences present in both lists are merged.
+     * @param cueSentences sentences with annotated cues
+     * @param posSentences POS sentences, in the same order as cueSentences
+     * @param replaceTags if true annotated words are replaced with CUE
+     * @return the merged sentences
+     */
+    public static List<AnnotatedSentence> merge(List<AnnotatedSentence> cueSentences, List<String> posSentences, boolean replaceTags) {
+        // Indexing past the end of the shorter list would fail, so only the
+        // common prefix is processed.
+        int numSentences = Math.min(cueSentences.size(), posSentences.size());
+        if (cueSentences.size() != posSentences.size()) {
+            System.err.println("Cue and POS inputs contain a different number of sentences ("
+                    + cueSentences.size() + " vs " + posSentences.size()
+                    + "); merging the first " + numSentences + " only");
+        }
+        List<AnnotatedSentence> mergedSentences = new ArrayList<AnnotatedSentence>(numSentences);
+        for (int i = 0; i < numSentences; ++i) {
+            mergedSentences.add(merge(cueSentences.get(i), posSentences.get(i), replaceTags));
+        }
+        return mergedSentences;
+    }
+    /**
+     *
+     * @param args
+     * 0 - cue input file
+     * 1 - pos input file
+     * 2 - replace cue with custom tag 'CUE' (true) or leave it as it is (false)
+     * 3 - merged file output path
+     */
+    public static void main(String[] args) {
+        if (args.length < 4) {
+            usage();
+            System.exit(1);
+        }
+        boolean replaceTags = Boolean.parseBoolean(args[2]);
+        List<AnnotatedSentence> cueSentences = AnnotatedSentencesIO.read(args[0]);
+        List<String> posSentences = null;
+        try {
+            posSentences = FileOperations.readFile(args[1]);
+        } catch (Exception ex) {
+            Logger.getLogger(CueAndPosFilesMerger.class.getName()).log(Level.SEVERE, "Could not read pos input file " + args[1], ex);
+            System.exit(1);
+        }
+        AnnotatedSentencesIO.write(args[3], merge(cueSentences, posSentences, replaceTags));
+    }
+}

data/lib/lingscope/src/lingscope/drivers/ModelTrainer.java ADDED Viewed

@@ -0,0 +1,39 @@
+package lingscope.drivers;
+import lingscope.algorithms.Annotator;
+/**
+ * Command line driver that trains a model file. The caller has to supply the
+ * training data.
+ * @author shashank
+ */
+public class ModelTrainer {
+    /**
+     * Writes the command line synopsis of the model trainer to standard output
+     */
+    public static void usage() {
+        final String synopsis = "Usage:\njava lingscope.drivers.ModelTrainer (cue|scope) (crf|baseline|negex) training_data_file file_where_model_will_be_saved";
+        System.out.println(synopsis);
+    }
+    /**
+     * Expects exactly four arguments. With any other count the usage is
+     * printed and the program exits with status 0; if no annotator is
+     * available for the requested types the usage is printed and the program
+     * exits with status 1.
+     * @param args
+     * 0 - Annotator type ("cue" or "scope")
+     * 1 - Model type ("crf", "baseline" or "negex")
+     * 2 - File from which training data will be read
+     * 3 - File where the model will be saved
+     */
+    public static void main(String[] args) {
+        final boolean wrongArgumentCount = (4 != args.length);
+        if (wrongArgumentCount) {
+            usage();
+            System.exit(0);
+        }
+        final String annotatorType = args[0];
+        final String modelType = args[1];
+        final Annotator trainedAnnotator = SentenceTagger.getAnnotator(modelType, annotatorType);
+        if (null == trainedAnnotator) {
+            usage();
+            System.exit(1);
+        }
+        final String trainingDataFile = args[2];
+        final String modelFile = args[3];
+        trainedAnnotator.serializeAnnotator(trainingDataFile, modelFile);
+    }
+}

data/lib/lingscope/src/lingscope/drivers/NegexDriver.java ADDED Viewed

@@ -0,0 +1,32 @@
+package lingscope.drivers;
+import java.util.List;
+import lingscope.algorithms.Annotator;
+import lingscope.io.AnnotatedSentencesIO;
+import lingscope.structures.AnnotatedSentence;
+/**
+ * Driver to mark scope or cue in a file with a NegEx annotator
+ * @author shashank
+ */
+public class NegexDriver {
+    /** Number of command line arguments that main reads (indices 0 to 4) */
+    private static final int NUM_ARGS = 5;
+    /**
+     * Prints the usage for the NegEx driver
+     */
+    private static void usage() {
+        System.err.println("Usage:\njava lingscope.drivers.NegexDriver (cue|scope) serialized_annotator_file input_file output_file input_is_annotated(true|false)");
+    }
+    /**
+     * Loads the serialized annotator, annotates every sentence of the input
+     * file and writes the annotated sentences to the output file
+     * @param args
+     * 0 - Annotator type ("cue" or "scope")
+     * 1 - Serialized annotator file
+     * 2 - Input file
+     * 3 - Output file
+     * 4 - (boolean) does the input file contain annotated sentence (true) or
+     * not (false)
+     */
+    public static void main(String[] args) {
+        if (args.length < NUM_ARGS) {
+            usage();
+            System.exit(1);
+        }
+        Annotator annotator = SentenceTagger.getAnnotator("negex", args[0]);
+        // Guard against an annotator that could not be created for the given type.
+        if (annotator == null) {
+            usage();
+            System.exit(1);
+        }
+        annotator.loadAnnotator(args[1]);
+        boolean isAnnotated = Boolean.parseBoolean(args[4]);
+        List<String> inputSentences = SentenceTagger.getStringList(args[2], isAnnotated);
+        List<AnnotatedSentence> outputSentences = SentenceTagger.annotateSentences(annotator, inputSentences, isAnnotated);
+        AnnotatedSentencesIO.write(args[3], outputSentences);
+    }
+}

data/lib/lingscope/src/lingscope/drivers/PosTaggerDriver.java ADDED Viewed

@@ -0,0 +1,62 @@
+package lingscope.drivers;
+import generalutils.FileOperations;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import lingscope.algorithms.PosTagger;
+/**
+ * Driver for the Part of Speech tagger. Use this to tag all sentences in a
+ * given file with part of speech tags
+ * @author shashank
+ */
+public class PosTaggerDriver {
+    private static PosTagger posTagger = null;
+    /**
+     * Gets the equivalent POS sentence for the given sentenceToTag
+     * @param grammerFile file containing POS model
+     * @param sentenceToTag sentence for which POS tags will be returned
+     * @param is
+     * @return a string where the words from sentenceToTag are replaced with
+     * corresponding part of speech tags
+     */
+    public static String getTaggedSentence(String grammerFile, String sentenceToTag, boolean isTokenized) {
+        if (posTagger == null) {
+            posTagger = new PosTagger(grammerFile);
+        }
+        List<String> posTags = posTagger.replaceWordsWithPos(sentenceToTag, isTokenized);
+        StringBuilder posSentence = new StringBuilder();
+        for (String posTag : posTags) {
+            posSentence.append(" ").append(posTag);
+        }
+        return posSentence.substring(1);
+    }
+    /**
+     *
+     * @param args
+     * 0 - file containing the part of speech model
+     * 1 - input file
+     * 2 - output file
+     * 3 - (boolean) does the input file contain annotated sentence (true) or
+     * not (false)
+     */
+    public static void main(String[] args) {
+        String grammarFile = args[0];
+        List<String> inputSentences = SentenceTagger.getStringList(args[1], Boolean.parseBoolean(args[3]));
+        List<String> outputSentences = new ArrayList<String>(inputSentences.size());
+        for (String inputSentence : inputSentences) {
+            String outputSentence = getTaggedSentence(grammarFile, inputSentence, Boolean.parseBoolean(args[3]));
+            outputSentences.add(outputSentence);
+        }
+        try {
+            FileOperations.writeFile(args[2], outputSentences);
+        } catch (Exception ex) {
+            Logger.getLogger(PosTaggerDriver.class.getName()).log(Level.SEVERE, null, ex);
+        }
+    }
+}

data/lib/lingscope/src/lingscope/drivers/SentencePosTagger.java ADDED Viewed

@@ -0,0 +1,89 @@
+package lingscope.drivers;
+import generalutils.FileOperations;
+import java.util.List;
+import lingscope.algorithms.Annotator;
+import lingscope.structures.AnnotatedSentence;
+/**
+ * Use this sentence tagger when using a model that tags POS
+ * @author shashank
+ */
+public class SentencePosTagger {
+    public static void usage() {
+        System.out.println("java lingscope.drivers.SentencePosTagger cue_tagging_model "
+                + "cue_tagger_type(baseline|crf|negex) "
+                + "replace_cue_with_custom_tag(true|false) scope_tagging_model "
+                + "pos_model_file sentence_to_tag");
+        System.out.println("\tSaved model for negation can be obtained from http://negscope.askhermes.org/");
+        System.out.println("\tSaved model for speculation can be obtained from http://hedgescope.askhermes.org/");
+        System.out.println("\tSaved model for NegEx can be obtained from http://code.google.com/p/negex/downloads/list");
+        System.out.println("\tSaved pos_model_file can be obtained from http://hedgescope.askhermes.org/");
+    }
+    /**
+     *
+     * @param args
+     * 0 - cue tagging model
+     * 1 - cue tagger type (baseline, crf or negex)
+     * 2 - replace cue words with custom tag CUE (true) or not (false)
+     * 3 - crf pos-based scope tagging model
+     * 4 - POS model file
+     * 5 - sentence to tag
+     */
+    public static void main(String[] args) {
+        if (args[0].equalsIgnoreCase("help")) {
+            usage();
+            System.exit(0);
+        } else if (args.length < 6) {
+            usage();
+            System.exit(1);
+        }
+        Annotator cueAnnotator = SentenceTagger.getAnnotator(args[1], "cue");
+        cueAnnotator.loadAnnotator(args[0]);
+        Annotator scopeAnnotator = SentenceTagger.getAnnotator("crf", "scope");
+        scopeAnnotator.loadAnnotator(args[3]);
+        String sentence = args[5];
+        String grammarFile = args[4];
+        if ("file".equalsIgnoreCase(sentence)) {
+            String sentencesFile = args[6];
+            try {
+                List<String> sentences = FileOperations.readFile(sentencesFile);
+                for (String sentenceText : sentences) {
+                    tagSentence(sentenceText, grammarFile,
+                            Boolean.parseBoolean(args[2]), cueAnnotator,
+                            scopeAnnotator);
+                }
+            } catch (Exception ex) {
+                ex.printStackTrace(System.err);
+            }
+        } else {
+            tagSentence(sentence, grammarFile, Boolean.parseBoolean(args[2]),
+                    cueAnnotator, scopeAnnotator);
+        }
+    }
+    /**
+     * Tags the given sentence
+     * @param sentence the text of the sentence to tag
+     * @param grammarFile path to the Stanford part of speech model file
+     * @param replaceCueWords if true, cue words will be replaced with custom
+     * tag 'CUE'
+     * @param cueAnnotator the {@link Annotator} object to identify negation or
+     * hedge cue in the sentence
+     * @param scopeAnnotator the {@link Annotator} object to identify negation
+     * or hedge scope in the sentence
+     */
+    public static void tagSentence(String sentence, String grammarFile,
+            boolean replaceCueWords, Annotator cueAnnotator, Annotator scopeAnnotator) {
+        String posSentence = PosTaggerDriver.getTaggedSentence(grammarFile, sentence, false);
+        AnnotatedSentence cueTaggedSentence = cueAnnotator.annotateSentence(sentence, false);
+        AnnotatedSentence posCueMerged = CueAndPosFilesMerger.merge(cueTaggedSentence, posSentence, replaceCueWords);
+        AnnotatedSentence scopeMarkedSentence = scopeAnnotator.annotateSentence(posCueMerged.getSentenceText(), true);
+        AnnotatedSentence scopeWordsMarkedSentence = AnnotatedFilesMerger.merge(cueTaggedSentence, scopeMarkedSentence);
+        System.out.println(scopeWordsMarkedSentence.getRawText());
+    }
+}