RubyGems - abstractor - Versions diffs - 1.0.8 - Mend

abstractor 1.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (346) hide show

data/lib/lingscope/src/lingscope/algorithms/Annotator.java ADDED Viewed

@@ -0,0 +1,51 @@
+package lingscope.algorithms;
+import java.util.List;
+import lingscope.structures.AnnotatedSentence;
+/**
+ * Base class for sentence annotators. An annotator is configured with three
+ * tag strings (begin, intermediate, other) and turns a sentence into an
+ * {@link AnnotatedSentence}.
+ *
+ * @author shashank
+ */
+public abstract class Annotator {
+    /** Tag string used for the first word of an annotated span. */
+    protected String beginTag;
+    /** Tag string used for the following words of an annotated span. */
+    protected String interTag;
+    /** Tag string used for words outside any annotated span. */
+    protected String otherTag;
+    /**
+     * Stores the three tag strings used by this annotator.
+     * @param beginTag tag for the beginning of a span
+     * @param interTag tag for intermediate words of a span
+     * @param otherTag tag for all other words
+     */
+    public Annotator(String beginTag, String interTag, String otherTag) {
+        this.beginTag = beginTag;
+        this.interTag = interTag;
+        this.otherTag = otherTag;
+    }
+    /**
+     * Builds the annotator from a training file and stores it in a model
+     * file. Implementations may not support this operation.
+     * @param trainingFile path of the training data
+     * @param modelFile path the model is written to
+     */
+    public abstract void serializeAnnotator(String trainingFile, String modelFile);
+    /**
+     * Annotates a single sentence.
+     * @param sentence the sentence to annotate
+     * @param isTokenized true if the sentence is already tokenized
+     * @return the annotated sentence
+     */
+    public abstract AnnotatedSentence annotateSentence(String sentence, boolean isTokenized);
+    /**
+     * Loads the annotator from a model file.
+     * @param modelFile path of the model to load
+     */
+    public abstract void loadAnnotator(String modelFile);
+    /**
+     * Checks if the given target phrase is negated in the given sentence. Only
+     * the first word of the target phrase is used, and only its first
+     * (case-insensitive) occurrence in the annotated sentence is consulted.
+     * @param sentence the sentence to annotate
+     * @param isTokenized true if the sentence is already tokenized
+     * @param targetPhrase the phrase to look up; surrounding whitespace is ignored
+     * @return the annotation flag of the first matching word, or false (after
+     * reporting on standard error) if the word does not occur in the sentence
+     */
+    public boolean isTargetNegated(String sentence, boolean isTokenized, String targetPhrase) {
+        AnnotatedSentence annotatedSentence = annotateSentence(sentence, isTokenized);
+        // Trim first: splitting an untrimmed phrase yields an empty array for a
+        // whitespace-only phrase and an empty first word for a phrase with
+        // leading whitespace. After trimming, split always returns at least
+        // one element.
+        String[] targetPhraseWords = targetPhrase.trim().split("\\s+");
+        String firstTargetWord = targetPhraseWords[0];
+        List<String> words = annotatedSentence.getWords();
+        List<Boolean> areNegated = annotatedSentence.getIsAnnotatedTags();
+        int index = 0;
+        for (String word : words) {
+            if (firstTargetWord.equalsIgnoreCase(word)) {
+                return areNegated.get(index);
+            }
+            ++index;
+        }
+        System.err.println("Phrase not found: " + targetPhrase);
+        return false;
+    }
+}

data/lib/lingscope/src/lingscope/algorithms/BaselineAnnotator.java ADDED Viewed

@@ -0,0 +1,80 @@
+package lingscope.algorithms;
+import generalutils.FileOperations;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+/**
+ * The baseline annotator. Its model is a set of lower-cased cue phrases
+ * collected from tagged training sentences; subclasses decide how those
+ * phrases are used to annotate a sentence.
+ * @author shashank
+ */
+public abstract class BaselineAnnotator extends Annotator {
+    /** Known cue phrases; null until the annotator is trained or loaded. */
+    protected Set<String> phrases;
+    /**
+     * Creates an annotator with no phrases loaded.
+     * @param beginTag tag for the beginning of a span
+     * @param interTag tag for intermediate words of a span
+     * @param otherTag tag for all other words
+     */
+    public BaselineAnnotator(String beginTag, String interTag, String otherTag) {
+        super(beginTag, interTag, otherTag);
+        phrases = null;
+    }
+    /**
+     * Collects the cue phrases of every tagged sentence in the training file
+     * and writes them to the model file. Failures are logged, not rethrown.
+     * @param trainingFile file of tagged sentences, one per entry
+     * @param modelFile file the collected phrases are written to
+     */
+    @Override
+    public void serializeAnnotator(String trainingFile, String modelFile) {
+        try {
+            phrases = new HashSet<String>();
+            List<String> taggedSentences = FileOperations.readFile(trainingFile);
+            for (String taggedSentence : taggedSentences) {
+                phrases.addAll(getCueWords(taggedSentence, beginTag, interTag, otherTag));
+            }
+            FileOperations.writeFile(modelFile, new ArrayList<String>(phrases));
+        } catch (Exception ex) {
+            Logger.getLogger(BaselineAnnotator.class.getName()).log(Level.SEVERE, null, ex);
+        }
+    }
+    /**
+     * Replaces the phrase set with the entries of the model file. Failures are
+     * logged, not rethrown, and leave the phrase set empty.
+     * @param modelFile file holding one phrase per entry
+     */
+    @Override
+    public void loadAnnotator(String modelFile) {
+        try {
+            phrases = new HashSet<String>();
+            phrases.addAll(FileOperations.readFile(modelFile));
+        } catch (Exception ex) {
+            Logger.getLogger(BaselineAnnotator.class.getName()).log(Level.SEVERE, null, ex);
+        }
+    }
+    /**
+     * Gets the set of cue word phrases in the given sentence. The given
+     * sentence is tagged according to Abner's specifications: tokens are
+     * separated by spaces and each token has the form word|tag. Tokens
+     * without a tag are skipped. A begin tag always starts a new phrase, and a
+     * phrase still open at the end of the sentence is included.
+     * @param abnerTaggedSentence sentence tagged by abner's specification.
+     * @param beginTag the tag to mark the beginning of the cue
+     * @param intermediateTag the tag to mark intermediate portions
+     * @param otherTag the other tag
+     * @return the set of cue words or phrases in the given sentence, trimmed
+     * and lower-cased
+     */
+    public static Set<String> getCueWords(String abnerTaggedSentence, String beginTag, String intermediateTag, String otherTag) {
+        Set<String> cueWordsPhrases = new HashSet<String>(1);
+        String[] elements = abnerTaggedSentence.split(" +");
+        boolean collect = false;
+        StringBuilder collectedPhrase = new StringBuilder();
+        for (String element : elements) {
+            String[] elementTokens = element.split("\\|");
+            if (elementTokens.length < 2) {
+                // Blank line or token without a tag: nothing to classify.
+                continue;
+            }
+            String word = elementTokens[0];
+            String tag = elementTokens[1];
+            if (tag.equalsIgnoreCase(beginTag)) {
+                if (collect) {
+                    // Two cues back to back: close the first before starting the next.
+                    addCollectedPhrase(cueWordsPhrases, collectedPhrase);
+                }
+                collect = true;
+                collectedPhrase.append(word).append(" ");
+            } else if (tag.equalsIgnoreCase(intermediateTag)) {
+                // An intermediate tag only extends a cue that has been started;
+                // a stray one must not leak into the next phrase.
+                if (collect) {
+                    collectedPhrase.append(word).append(" ");
+                }
+            } else if (tag.equalsIgnoreCase(otherTag) && collect) {
+                collect = false;
+                addCollectedPhrase(cueWordsPhrases, collectedPhrase);
+            }
+        }
+        if (collect) {
+            // The sentence ended while a cue was still open.
+            addCollectedPhrase(cueWordsPhrases, collectedPhrase);
+        }
+        return cueWordsPhrases;
+    }
+    /**
+     * Adds the buffered phrase (trimmed, lower-cased) to the set if it is not
+     * empty, then clears the buffer completely.
+     */
+    private static void addCollectedPhrase(Set<String> cueWordsPhrases, StringBuilder collectedPhrase) {
+        String phrase = collectedPhrase.toString().trim().toLowerCase();
+        if (phrase.length() > 0) {
+            cueWordsPhrases.add(phrase);
+        }
+        collectedPhrase.setLength(0);
+    }
+}

data/lib/lingscope/src/lingscope/algorithms/BaselineCueAnnotator.java ADDED Viewed

@@ -0,0 +1,84 @@
+package lingscope.algorithms;
+import java.util.HashSet;
+import java.util.Set;
+import lingscope.structures.AnnotatedSentence;
+/**
+ * Baseline cue annotator: tags every occurrence of a known cue phrase in a
+ * sentence, giving the first word the begin tag and the remaining words the
+ * intermediate tag.
+ *
+ * @author shashank
+ */
+public class BaselineCueAnnotator extends BaselineAnnotator {
+    public BaselineCueAnnotator(String beginTag, String interTag, String otherTag) {
+        super(beginTag, interTag, otherTag);
+    }
+    /**
+     * Tags the words of the sentence that belong to a known cue phrase.
+     * @param sentence the sentence to annotate
+     * @param isTokenized if false, the sentence is first split by punctuation
+     * @return the annotated sentence built from the word|tag string
+     * @throws RuntimeException if no phrases have been loaded
+     */
+    @Override
+    public AnnotatedSentence annotateSentence(String sentence, boolean isTokenized) {
+        if (phrases == null) {
+            throw new RuntimeException("Annotator has not been loaded");
+        }
+        String text = isTokenized ? sentence : AbnerTokenizer.splitTermsByPunctuation(sentence);
+        String loweredText = text.toLowerCase();
+        String[] tokens = text.split(" +");
+        Set<Integer> beginPositions = new HashSet<Integer>();
+        Set<Integer> interPositions = new HashSet<Integer>();
+        // First pass: record where each cue phrase starts and which
+        // positions it continues over.
+        for (String phrase : phrases) {
+            if (loweredText.contains(phrase)) {
+                String[] cueTokens = phrase.split(" +");
+                for (int start = 0; start < tokens.length; ++start) {
+                    if (tokens[start].equalsIgnoreCase(cueTokens[0]) && phraseStartsAt(tokens, start, cueTokens)) {
+                        beginPositions.add(start);
+                        for (int offset = 1; offset < cueTokens.length; ++offset) {
+                            interPositions.add(start + offset);
+                        }
+                    }
+                }
+            }
+        }
+        // Second pass: emit word|tag pairs. A position marked as both begin
+        // and intermediate is tagged as begin.
+        StringBuilder tagged = new StringBuilder();
+        for (int position = 0; position < tokens.length; ++position) {
+            String tag;
+            if (beginPositions.contains(position)) {
+                tag = beginTag;
+            } else if (interPositions.contains(position)) {
+                tag = interTag;
+            } else {
+                tag = otherTag;
+            }
+            tagged.append(" ").append(tokens[position]).append("|").append(tag);
+        }
+        // Drop the leading separator added before the first token.
+        return new AnnotatedSentence(tagged.substring(1));
+    }
+    /**
+     * Tells whether every cue token equals (ignoring case) the sentence token
+     * at the corresponding position, starting at the given index. Running off
+     * the end of the sentence counts as a mismatch.
+     */
+    private static boolean phraseStartsAt(String[] tokens, int start, String[] cueTokens) {
+        for (int offset = 0; offset < cueTokens.length; ++offset) {
+            int position = start + offset;
+            if (position == tokens.length || !cueTokens[offset].equalsIgnoreCase(tokens[position])) {
+                return false;
+            }
+        }
+        return true;
+    }
+}

data/lib/lingscope/src/lingscope/algorithms/BaselineScopeAnnotator.java ADDED Viewed

@@ -0,0 +1,101 @@
+package lingscope.algorithms;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import lingscope.structures.AnnotatedSentence;
+/**
+ * Baseline scope annotator: a scope opens at every known cue phrase and
+ * extends over the following words until a token matches the configured end
+ * pattern or the sentence ends.
+ *
+ * @author shashank
+ */
+public class BaselineScopeAnnotator extends BaselineAnnotator {
+    /** A token that fully matches this pattern closes an open scope. */
+    protected Pattern endPattern;
+    /**
+     * @param beginTag tag for the word that opens a scope
+     * @param interTag tag for the following words inside a scope
+     * @param otherTag tag for all other words
+     * @param commaLimit if true, a comma token closes an open scope
+     * @param periodLimit if true, a period token closes an open scope
+     */
+    public BaselineScopeAnnotator(String beginTag, String interTag, String otherTag, boolean commaLimit, boolean periodLimit) {
+        super(beginTag, interTag, otherTag);
+        String terminators;
+        if (commaLimit) {
+            terminators = periodLimit ? ",|\\." : ",";
+        } else {
+            // With neither limit the pattern is empty and only an empty token matches it.
+            terminators = periodLimit ? "\\." : "";
+        }
+        endPattern = Pattern.compile(terminators);
+    }
+    /**
+     * Tags the scope following each known cue phrase in the sentence.
+     * @param sentence the sentence to annotate
+     * @param isTokenized if false, the sentence is first split by punctuation
+     * @return the annotated sentence built from the word|tag string
+     * @throws RuntimeException if no phrases have been loaded
+     */
+    @Override
+    public AnnotatedSentence annotateSentence(String sentence, boolean isTokenized) {
+        if (phrases == null) {
+            throw new RuntimeException("Annotator has not been loaded");
+        }
+        String text = isTokenized ? sentence : AbnerTokenizer.splitTermsByPunctuation(sentence);
+        String loweredText = text.toLowerCase();
+        String[] tokens = text.split(" +");
+        Set<Integer> scopeStarts = new HashSet<Integer>();
+        // First pass: record the position of the first word of every cue
+        // phrase occurrence.
+        for (String phrase : phrases) {
+            if (loweredText.contains(phrase)) {
+                String[] cueTokens = phrase.split(" +");
+                for (int start = 0; start < tokens.length; ++start) {
+                    if (tokens[start].equalsIgnoreCase(cueTokens[0]) && phraseStartsAt(tokens, start, cueTokens)) {
+                        scopeStarts.add(start);
+                    }
+                }
+            }
+        }
+        // Second pass: emit word|tag pairs, keeping a scope open from each
+        // start position until an end token is seen.
+        StringBuilder tagged = new StringBuilder();
+        boolean inScope = false;
+        for (int position = 0; position < tokens.length; ++position) {
+            String token = tokens[position];
+            String tag;
+            if (scopeStarts.contains(position)) {
+                tag = beginTag;
+                inScope = true;
+            } else if (inScope && endPattern.matcher(token).matches()) {
+                // The end token itself lies outside the scope.
+                tag = otherTag;
+                inScope = false;
+            } else if (inScope) {
+                tag = interTag;
+            } else {
+                tag = otherTag;
+            }
+            tagged.append(" ").append(token).append("|").append(tag);
+        }
+        // Drop the leading separator added before the first token.
+        return new AnnotatedSentence(tagged.substring(1));
+    }
+    /**
+     * Tells whether every cue token equals (ignoring case) the sentence token
+     * at the corresponding position, starting at the given index. Running off
+     * the end of the sentence counts as a mismatch.
+     */
+    private static boolean phraseStartsAt(String[] tokens, int start, String[] cueTokens) {
+        for (int offset = 0; offset < cueTokens.length; ++offset) {
+            int position = start + offset;
+            if (position == tokens.length || !cueTokens[offset].equalsIgnoreCase(tokens[position])) {
+                return false;
+            }
+        }
+        return true;
+    }
+}

data/lib/lingscope/src/lingscope/algorithms/CrfAnnotator.java ADDED Viewed

@@ -0,0 +1,45 @@
+package lingscope.algorithms;
+import abner.Tagger;
+import abner.Trainer;
+import java.io.File;
+import lingscope.structures.AnnotatedSentence;
+/**
+ * A CRF based annotator that trains and tags through the abner
+ * {@code Trainer} and {@code Tagger} classes.
+ * @author shashank
+ */
+public class CrfAnnotator extends Annotator {
+    /** The abner tagger; null until a model has been loaded. */
+    private Tagger tagger;
+    public CrfAnnotator(String beginTag, String interTag, String otherTag) {
+        super(beginTag, interTag, otherTag);
+        this.tagger = null;
+    }
+    /**
+     * Trains a model from the training file, writes it to the model file and
+     * then loads that model into this annotator.
+     */
+    @Override
+    public void serializeAnnotator(String trainingFile, String modelFile) {
+        new Trainer().train(trainingFile, modelFile);
+        loadAnnotator(modelFile);
+    }
+    /**
+     * Tags the sentence with the loaded tagger.
+     * @param sentence the sentence to annotate
+     * @param isTokenized if false, the sentence is first split by punctuation
+     * @return the annotated sentence built from the trimmed tagger output
+     * @throws RuntimeException if no model has been loaded
+     */
+    @Override
+    public AnnotatedSentence annotateSentence(String sentence, boolean isTokenized) {
+        if (tagger == null) {
+            throw new RuntimeException("Tagger has not been loaded");
+        }
+        String tokenized = isTokenized ? sentence : AbnerTokenizer.splitTermsByPunctuation(sentence);
+        String taggedText = tagger.tagABNER(tokenized);
+        return new AnnotatedSentence(taggedText.trim());
+    }
+    /**
+     * Creates the tagger from the model file and switches off its own
+     * tokenization (presumably because input is tokenized before tagging).
+     */
+    @Override
+    public void loadAnnotator(String modelFile) {
+        File model = new File(modelFile);
+        this.tagger = new Tagger(model);
+        this.tagger.setTokenization(false);
+    }
+}

data/lib/lingscope/src/lingscope/algorithms/NegexAnnotator.java ADDED Viewed

@@ -0,0 +1,52 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package lingscope.algorithms;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.util.ArrayList;
+import java.util.Scanner;
+import java.util.Set;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import lingscope.algorithms.negex.GenNegEx;
+import lingscope.structures.AnnotatedSentence;
+/**
+ * Annotates negation using Negex. The model is a rule file that is read line
+ * by line; subclasses pass those rules to the NegEx engine.
+ * @author shashank
+ */
+public abstract class NegexAnnotator extends Annotator {
+    /** The NegEx engine; null until a rule file has been loaded successfully. */
+    protected GenNegEx negex;
+    /** The lines of the loaded rule file; set together with {@code negex}. */
+    protected ArrayList<String> rules;
+    public NegexAnnotator(String beginTag, String interTag, String otherTag) {
+        super(beginTag, interTag, otherTag);
+        negex = null;
+    }
+    /**
+     * Not supported: NegEx rules are not trained by this class.
+     * @throws UnsupportedOperationException always
+     */
+    @Override
+    public void serializeAnnotator(String trainingFile, String modelFile) {
+        throw new UnsupportedOperationException("NegEx's serialized version can be downloaded from the internet.");
+    }
+    /**
+     * Reads the rule file and, on success, installs a new NegEx engine
+     * together with the rules. If the file cannot be found the failure is
+     * logged and the annotator state is left unchanged, so an annotator that
+     * was never loaded still reports itself as not loaded.
+     * @param modelFile path of the NegEx rule file, one rule per line
+     */
+    @Override
+    public void loadAnnotator(String modelFile) {
+        try {
+            File ruleFile = new File(modelFile);
+            Scanner sc = new Scanner(ruleFile);
+            ArrayList<String> loadedRules = new ArrayList<String>();
+            try {
+                while (sc.hasNextLine()) {
+                    loadedRules.add(sc.nextLine());
+                }
+            } finally {
+                // Release the file handle even if reading fails part-way.
+                sc.close();
+            }
+            // Publish engine and rules together, only after a complete read.
+            negex = new GenNegEx();
+            rules = loadedRules;
+        } catch (FileNotFoundException ex) {
+            Logger.getLogger(NegexAnnotator.class.getName()).log(Level.SEVERE, null, ex);
+        }
+    }
+}

data/lib/lingscope/src/lingscope/algorithms/NegexCueAnnotator.java ADDED Viewed

@@ -0,0 +1,26 @@
+package lingscope.algorithms;
+import lingscope.structures.AnnotatedSentence;
+/**
+ * Cue annotator backed by NegEx: asks the NegEx engine for the cue tagging
+ * of a sentence using the loaded rules.
+ *
+ * @author shashank
+ */
+public class NegexCueAnnotator extends NegexAnnotator {
+    public NegexCueAnnotator(String beginTag, String interTag, String otherTag) {
+        super(beginTag, interTag, otherTag);
+    }
+    /**
+     * Tags the negation cues of the sentence.
+     * @param sentence the sentence to annotate
+     * @param isTokenized if false, the sentence is first split by punctuation
+     * @return the annotated sentence built from the NegEx output
+     * @throws RuntimeException if the NegEx engine has not been loaded
+     */
+    @Override
+    public AnnotatedSentence annotateSentence(String sentence, boolean isTokenized) {
+        if (negex == null) {
+            throw new RuntimeException("Annotator has not been loaded");
+        }
+        String tokenized = isTokenized ? sentence : AbnerTokenizer.splitTermsByPunctuation(sentence);
+        String cueTagged = negex.getCue(tokenized, rules, beginTag, interTag, otherTag);
+        return new AnnotatedSentence(cueTagged);
+    }
+}