opener-tree-tagger 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (38) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +150 -0
  3. data/bin/opener-tree-tagger-daemon +7 -0
  4. data/bin/opener-tree-tagger-server +11 -0
  5. data/bin/tree-tagger +7 -0
  6. data/config.ru +5 -0
  7. data/core/dutch.map.treetagger.kaf.csv +40 -0
  8. data/core/english.map.treetagger.kaf.csv +36 -0
  9. data/core/french.map.treetagger.kaf.csv +33 -0
  10. data/core/german.map.treetagger.kaf.csv +52 -0
  11. data/core/italian.map.treetagger.kaf.csv +38 -0
  12. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/PKG-INFO +10 -0
  13. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/SOURCES.txt +7 -0
  14. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/dependency_links.txt +1 -0
  15. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/installed-files.txt +11 -0
  16. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/top_level.txt +1 -0
  17. data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.py +165 -0
  18. data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.pyc +0 -0
  19. data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.py +439 -0
  20. data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.pyc +0 -0
  21. data/core/site-packages/pre_build/VUKafParserPy/__init__.py +7 -0
  22. data/core/site-packages/pre_build/VUKafParserPy/__init__.pyc +0 -0
  23. data/core/spanish.map.treetagger.kaf.csv +75 -0
  24. data/core/token_matcher.py +82 -0
  25. data/core/tt_from_kaf_to_kaf.py +215 -0
  26. data/exec/tree-tagger.rb +9 -0
  27. data/ext/hack/Rakefile +13 -0
  28. data/ext/hack/support.rb +38 -0
  29. data/lib/opener/tree_tagger.rb +69 -0
  30. data/lib/opener/tree_tagger/cli.rb +69 -0
  31. data/lib/opener/tree_tagger/public/markdown.css +284 -0
  32. data/lib/opener/tree_tagger/server.rb +16 -0
  33. data/lib/opener/tree_tagger/version.rb +5 -0
  34. data/lib/opener/tree_tagger/views/index.erb +96 -0
  35. data/lib/opener/tree_tagger/views/result.erb +15 -0
  36. data/opener-tree-tagger.gemspec +35 -0
  37. data/pre_build_requirements.txt +1 -0
  38. metadata +197 -0
@@ -0,0 +1,75 @@
1
+ ACRNM O acronym (ISO, CEI)
2
+ ADJ G Adjectives (mayores, mayor)
3
+ ADV A Adverbs (muy, demasiado, cómo)
4
+ ALFP O Plural letter of the alphabet (As/Aes, bes)
5
+ ALFS O Singular letter of the alphabet (A, b)
6
+ ART D Articles (un, las, la, unas)
7
+ BACKSLASH O backslash (\)
8
+ CARD O Cardinals
9
+ CC C Coordinating conjunction (y, o)
10
+ CCAD C Adversative coordinating conjunction (pero)
11
+ CCNEG C Negative coordinating conjunction (ni)
12
+ CM O comma (,)
13
+ CODE O Alphanumeric code
14
+ COLON O colon (:)
15
+ CQUE C que (as conjunction)
16
+ CSUBF C Subordinating conjunction that introduces finite clauses (apenas)
17
+ CSUBI C Subordinating conjunction that introduces infinite clauses (al)
18
+ CSUBX C Subordinating conjunction underspecified for subord-type (aunque)
19
+ DASH O dash (-)
20
+ DM Q Demonstrative pronouns (ésas, ése, esta)
21
+ DOTS O POS tag for "..."
22
+ FO O Formula
23
+ FS O Full stop punctuation marks
24
+ INT Q Interrogative pronouns (quiénes, cuántas, cuánto)
25
+ ITJN O Interjection (oh, ja)
26
+ LP O left parenthesis ("(", "[")
27
+ NC N Common nouns (mesas, mesa, libro, ordenador)
28
+ NEG O Negation
29
+ NMEA N measure noun (metros, litros)
30
+ NMON N month name
31
+ NP R Proper nouns
32
+ ORD O Ordinals (primer, primeras, primera)
33
+ PAL O Portmanteau word formed by a and el
34
+ PDEL O Portmanteau word formed by de and el
35
+ PE O Foreign word
36
+ PERCT O percent sign (%)
37
+ PNC O Unclassified word
38
+ PPC Q Clitic personal pronoun (le, les)
39
+ PPO Q Possessive pronouns (mi, su, sus)
40
+ PPX Q Clitics and personal pronouns (nos, me, nosotras, te, sí)
41
+ PREP O Negative preposition (sin)
42
+ PREP O Preposition
43
+ PREP/DEL O Complex preposition "después del"
44
+ QT O quotation symbol (" ' `)
45
+ QU O Quantifiers (sendas, cada)
46
+ REL Q Relative pronouns (cuyas, cuyo)
47
+ RP O right parenthesis (")", "]")
48
+ SE O Se (as particle)
49
+ SEMICOLON O semicolon (;)
50
+ SLASH O slash (/)
51
+ SYM O Symbols
52
+ UMMX N measure unit (MHz, km, mA)
53
+ VCLIger V clitic gerund verb
54
+ VCLIinf V clitic infinitive verb
55
+ VCLIfin V clitic finite verb
56
+ VEadj V Verb estar. Past participle
57
+ VEfin V Verb estar. Finite
58
+ VEger V Verb estar. Gerund
59
+ VEinf V Verb estar. Infinitive
60
+ VHadj V Verb haber. Past participle
61
+ VHfin V Verb haber. Finite
62
+ VHger V Verb haber. Gerund
63
+ VHinf V Verb haber. Infinitive
64
+ VLadj V Lexical verb. Past participle
65
+ VLfin V Lexical verb. Finite
66
+ VLger V Lexical verb. Gerund
67
+ VLinf V Lexical verb. Infinitive
68
+ VMadj V Modal verb. Past participle
69
+ VMfin V Modal verb. Finite
70
+ VMger V Modal verb. Gerund
71
+ VMinf V Modal verb. Infinitive
72
+ VSadj V Verb ser. Past participle
73
+ VSfin V Verb ser. Finite
74
+ VSger V Verb ser. Gerund
75
+ VSinf V Verb ser. Infinitive
@@ -0,0 +1,82 @@
1
+ #!/usr/bin/env python
2
+
3
+
4
+ #####
5
+ # 4-Mar-2013 : modified order of rules to check first if there is a merge and then if it is an extra token
6
+ # because of this case, where both can occur: [ .. . ] [ . . . ]
7
+
8
+
9
+ def add_match(d,id_new,id_ref):
10
+ if id_new in d:
11
+ d[id_new].append(id_ref)
12
+ else:
13
+ d[id_new]=[id_ref]
14
+
15
+
16
+ def token_matcher(l_ref,l_new,super_d):
17
+ debug = False
18
+ if debug:
19
+ print l_ref
20
+ print l_new
21
+ if len(l_new)==0:
22
+ return
23
+ else:
24
+ token_ref, id_ref = l_ref[0]
25
+ token_new, id_new = l_new[0]
26
+ if token_ref == token_new:
27
+ if debug: print 'Matching ',l_ref[0],l_new[0]
28
+ if debug: print 'A',l_ref[0],l_new[0]
29
+ add_match(super_d,id_new,id_ref)
30
+ token_matcher(l_ref[1:],l_new[1:],super_d)
31
+ else:
32
+ if token_ref.startswith(token_new) : ##There was an split
33
+ if debug: print 'D'
34
+ aux = (token_ref[len(token_new):],id_ref)
35
+ l_ref[0]=aux
36
+
37
+ add_match(super_d,id_new,id_ref)
38
+ token_matcher(l_ref,l_new[1:],super_d)
39
+
40
+ elif token_new.startswith(token_ref) : ##There was a merge
41
+ if debug: print 'E'
42
+ aux = (token_new[len(token_ref)+1:],id_new)
43
+ l_new[0]=aux
44
+ add_match(super_d,id_new,id_ref)
45
+ token_matcher(l_ref[1:],l_new,super_d)
46
+
47
+
48
+ elif len(l_new)>1 and l_new[1][0]==token_ref: ## There is an extra token in l_new
49
+ if debug: print 'B',l_new[1][0],token_ref
50
+ token_matcher(l_ref[0:],l_new[1:],super_d)
51
+
52
+
53
+ elif len(l_ref)>1 and l_ref[1][0] == token_new: ## There is an extra token in l_ref
54
+ if debug: print 'C',l_ref[1:],l_new[0:]
55
+ token_matcher(l_ref[1:],l_new[0:],super_d)
56
+
57
+
58
+ else: ## Imposible matching
59
+ if debug: print 'F'
60
+ if debug: print 'Impossible match of ',l_new[0],l_ref[0]
61
+ token_matcher(l_ref[1:],l_new[1:],super_d)
62
+
63
+
64
+ if __name__ == '__main__':
65
+ l1 = []
66
+ s1 = 'Beatrix Wilhelmina Armgard van Oranje -Nassau (Baarn , 31 januari 1938 ) is sinds 30 april 1980 koningin van het Koninkrijk der Nederlanden'
67
+
68
+ s1 = 'Th is is a very simple example'
69
+ for n,t in enumerate(s1.split(' ')):
70
+ l1.append((t,'id'+str(n)))
71
+
72
+ l2 = []
73
+ #s2 = 'Beatrix Wilhelmina Armgard van Oranje -Nassau ( Baarn , 31 januari 1938 ) is sinds 30 april 1980 koningin van het Koninkrijk der Nederlanden'
74
+ s2 = 'This is a very sim ple example'
75
+ for n,t in enumerate(s2.split(' ')):
76
+ l2.append((t,'id'+str(n)))
77
+
78
+ super_d = {}
79
+ token_matcher(l1,l2,super_d)
80
+ print l1
81
+ print l2
82
+ print super_d
@@ -0,0 +1,215 @@
1
+ #!/usr/bin/env python
2
+ #-*- coding: utf8 *-*
3
+ __version__ = '1.2 4-Mar-2013'
4
+
5
+ ## Last changes
6
+ # 1-Mar-2013 --> now it works with UTF-8 !!!
7
+ # 4-Mar-2013 --> added code for including the element in the linguistic processors header
8
+ # 5-Mar-2013 --> language is not a parameter, is read from the input KAF
9
+ # 9-dec-2013 --> the postagger avoids 2 terms with the same tokenid span, like 's --> ' and s
10
+ # 11-mar-2014 --> fixed problem when merge with token_matcher
11
+ ###################################
12
+
13
+
14
+ import sys
15
+ import os
16
+
17
+ this_folder = os.path.dirname(os.path.realpath(__file__))
18
+
19
+ # This updates the load path to ensure that the local site-packages directory
20
+ # can be used to load packages (e.g. a locally installed copy of lxml).
21
+ sys.path.append(os.path.join(this_folder, 'site-packages/pre_build'))
22
+
23
+ import operator
24
+ import time
25
+ import getopt
26
+ import string
27
+ import subprocess
28
+ import lxml
29
+ from lxml import etree
30
+ from lxml.etree import ElementTree as ET, Element as EL, PI
31
+ from VUKafParserPy.KafParserMod import KafParser
32
+ from token_matcher import token_matcher
33
+
34
+
35
+
36
+ if not os.environ.get('TREE_TAGGER_PATH'):
37
+ print>>sys.stderr,"TREE_TAGGER_PATH environment variable not found. Please set the full path to your tree tagger in the TREE_TAGGER_PATH environent variable."
38
+ sys.exit(-1)
39
+
40
+ complete_path_to_treetagger = os.environ.get('TREE_TAGGER_PATH')
41
+
42
+
43
+ def loadMapping(mapping_file):
44
+ map={}
45
+ filename = os.path.join(os.path.dirname(__file__),mapping_file)
46
+ fic = open(filename)
47
+ for line in fic:
48
+ fields = line.strip().split()
49
+ map[fields[0]] = fields[1]
50
+ fic.close()
51
+ return map
52
+
53
+
54
+
55
+ if __name__=='__main__':
56
+ this_folder = os.path.dirname(os.path.realpath(__file__))
57
+
58
+ if sys.stdin.isatty():
59
+ print>>sys.stderr,'Input stream required.'
60
+ print>>sys.stderr,'Example usage: cat myUTF8file.kaf |',sys.argv[0]
61
+ sys.exit(-1)
62
+
63
+ time_stamp = True
64
+ try:
65
+ opts, args = getopt.getopt(sys.argv[1:],"l:",["no-time"])
66
+ for opt, arg in opts:
67
+ if opt == "--no-time":
68
+ time_stamp = False
69
+ except getopt.GetoptError:
70
+ pass
71
+
72
+
73
+ input_kaf = KafParser(sys.stdin)
74
+ my_lang = input_kaf.getLanguage()
75
+
76
+
77
+ if my_lang == 'en':
78
+ treetagger_cmd = complete_path_to_treetagger+'/cmd/tree-tagger-english-utf8'
79
+ mapping_file = this_folder +'/english.map.treetagger.kaf.csv'
80
+ model = 'English models'
81
+ elif my_lang == 'nl':
82
+ treetagger_cmd = complete_path_to_treetagger+'/cmd/tree-tagger-dutch-utf8'
83
+ mapping_file = this_folder +'/dutch.map.treetagger.kaf.csv'
84
+ model = 'Dutch models'
85
+ elif my_lang == 'de':
86
+ treetagger_cmd = complete_path_to_treetagger+'/cmd/tree-tagger-german-utf8'
87
+ mapping_file = this_folder +'/german.map.treetagger.kaf.csv'
88
+ model = 'German models'
89
+ elif my_lang == 'fr':
90
+ treetagger_cmd = complete_path_to_treetagger+'/cmd/tree-tagger-french-utf8'
91
+ mapping_file = this_folder +'/french.map.treetagger.kaf.csv'
92
+ model = 'French models'
93
+ elif my_lang == 'it':
94
+ treetagger_cmd = complete_path_to_treetagger+'/cmd/tree-tagger-italian-utf8'
95
+ mapping_file = this_folder +'/italian.map.treetagger.kaf.csv'
96
+ model = 'Italian models'
97
+ elif my_lang == 'es':
98
+ treetagger_cmd = complete_path_to_treetagger+'/cmd/tree-tagger-spanish-utf8'
99
+ mapping_file = this_folder +'/spanish.map.treetagger.kaf.csv'
100
+ model = 'Spanish models'
101
+ else: ## Default is dutch
102
+ treetagger_cmd = complete_path_to_treetagger+'/cmd/tree-tagger-dutch-utf8'
103
+ mapping_file = this_folder +'/dutch.map.treetagger.kaf.csv'
104
+ model = 'Dutch models'
105
+
106
+ map_tt_to_kaf = loadMapping(mapping_file)
107
+
108
+
109
+ ## Create the input text for
110
+ reference_tokens = []
111
+ sentences = []
112
+ prev_sent='-200'
113
+ aux = []
114
+ for word, sent_id, w_id in input_kaf.getTokens():
115
+ if sent_id != prev_sent:
116
+ if len(aux) != 0:
117
+ sentences.append(aux)
118
+ aux = []
119
+ aux.append((word,w_id))
120
+
121
+ prev_sent = sent_id
122
+ if len(aux)!=0:
123
+ sentences.append(aux)
124
+
125
+
126
+ for sentence in sentences:
127
+ #print>>sys.stderr,'Input sentnece:',sentence
128
+ text = ' '.join(t.encode('utf-8') for t,_ in sentence)
129
+
130
+ if not os.path.isfile(treetagger_cmd):
131
+ print>>sys.stderr, "Can't find the proper tree tagger command: " +treetagger_cmd
132
+ raise IOError(treetagger_cmd)
133
+ try:
134
+ tt_proc = subprocess.Popen(treetagger_cmd,stdin=subprocess.PIPE, stdout=subprocess.PIPE,stderr=subprocess.PIPE)
135
+ except Exception as e:
136
+ print>>sys.stderr,str(e)
137
+
138
+ out, err = tt_proc.communicate(text)
139
+
140
+ #print>>sys.stderr,'Output treetagger',out
141
+ data = {}
142
+ new_tokens = []
143
+ for n,line in enumerate(out.splitlines()):
144
+ line = line.decode('utf-8')
145
+ my_id='t_'+str(n)
146
+ token,pos,lemma = line.strip().split('\t')
147
+ pos_kaf = map_tt_to_kaf.get(pos,'O')
148
+
149
+ if lemma=='<unknown>':
150
+ lemma=token
151
+ pos+=' unknown_lemma'
152
+ if pos_kaf in ['N','R','G','V','A','O']:
153
+ type_term = 'open'
154
+ else:
155
+ type_term = 'close'
156
+ data[my_id] = (token,pos_kaf,lemma,type_term,pos)
157
+ new_tokens.append((token,my_id))
158
+ #tt_proc.terminate()
159
+
160
+ mapping_tokens = {}
161
+ #print
162
+ #print 'SENTENCE',sentence
163
+ #print 'New=tokens',new_tokens
164
+ token_matcher(sentence,new_tokens,mapping_tokens)
165
+ #print mapping_tokens
166
+ #print
167
+ new_terms = []
168
+ terms_for_token = {}
169
+ for token_new, id_new in new_tokens:
170
+ token,pos_kaf,lemma,type_term,pos = data[id_new]
171
+ ref_tokens = mapping_tokens[id_new]
172
+ span = []
173
+ #print token_new, id_new, ref_tokens
174
+ for ref_token in ref_tokens:
175
+ span.append(ref_token)
176
+ if ref_token in terms_for_token:
177
+ terms_for_token[ref_token].append(id_new)
178
+ else:
179
+ terms_for_token[ref_token] = [id_new]
180
+
181
+ new_terms.append((id_new,type_term,pos_kaf,pos,lemma,span))
182
+
183
+
184
+ #print terms_for_token
185
+ not_use = set()
186
+ for id_new,type_term,pos_kaf,pos,lemma,span in new_terms:
187
+ #print not_use
188
+ #print id_new
189
+ if id_new not in not_use:
190
+ new_lemma = ''
191
+ for tokenid in span:
192
+ if len(terms_for_token[tokenid]) > 1:
193
+ new_lemma += (''.join(data[t][2] for t in terms_for_token[tokenid])).lower()
194
+ not_use |= set(terms_for_token[tokenid])
195
+ if new_lemma != '':
196
+ lemma = new_lemma
197
+
198
+ ###############
199
+ ele_term = EL('term',attrib={'tid':id_new,
200
+ 'type':type_term,
201
+ 'pos':pos_kaf,
202
+ 'morphofeat':pos,
203
+ 'lemma':lemma})
204
+ ele_span = EL('span')
205
+ for ref_token in span:
206
+ eleTarget = EL('target',attrib={'id':ref_token})
207
+ ele_span.append(eleTarget)
208
+ ele_term.append(ele_span)
209
+ input_kaf.addElementToLayer('terms', ele_term)
210
+ ##End for each sentence
211
+
212
+ input_kaf.addLinguisticProcessor('TreeTagger_from_kaf '+model,'1.0', 'term', time_stamp)
213
+ input_kaf.saveToFile(sys.stdout)
214
+
215
+
@@ -0,0 +1,9 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'opener/daemons'
4
+ require_relative '../lib/opener/tree_tagger'
5
+
6
+ options = Opener::Daemons::OptParser.parse!(ARGV)
7
+ daemon = Opener::Daemons::Daemon.new(Opener::TreeTagger, options)
8
+
9
+ daemon.start
@@ -0,0 +1,13 @@
1
+ require 'rake'
2
+ require_relative 'support'
3
+
4
+ desc 'Verifies the requirements'
5
+ task :requirements do
6
+ verify_requirements
7
+ end
8
+
9
+ task :default => :requirements do
10
+ # path = File.join(PYTHON_SITE_PACKAGES, 'pre_install')
11
+ #
12
+ # pip_install(PRE_INSTALL_REQUIREMENTS, path)
13
+ end
@@ -0,0 +1,38 @@
1
+ require 'opener/build-tools'
2
+
3
+ include Opener::BuildTools::Requirements
4
+ include Opener::BuildTools::Python
5
+ include Opener::BuildTools::Files
6
+
7
+ # Directory where packages will be installed to.
8
+ PYTHON_SITE_PACKAGES = File.expand_path(
9
+ '../../../core/site-packages',
10
+ __FILE__
11
+ )
12
+
13
+ # Directory containing the temporary files.
14
+ TMP_DIRECTORY = File.expand_path('../../../tmp', __FILE__)
15
+
16
+ # Path to the pip requirements file used to install requirements before
17
+ # packaging the Gem.
18
+ PRE_BUILD_REQUIREMENTS = File.expand_path(
19
+ '../../../pre_build_requirements.txt',
20
+ __FILE__
21
+ )
22
+
23
+ # Path to the pip requirements file used to install requirements upon Gem
24
+ # installation.
25
+ PRE_INSTALL_REQUIREMENTS = File.expand_path(
26
+ '../../../pre_install_requirements.txt',
27
+ __FILE__
28
+ )
29
+
30
+ ##
31
+ # Verifies the requirements to install thi Gem.
32
+ #
33
+ def verify_requirements
34
+ require_executable('python')
35
+ require_version('python', python_version, '2.6.0')
36
+ require_executable('pip')
37
+ require_version('pip', pip_version, '1.3.1')
38
+ end
@@ -0,0 +1,69 @@
1
+ require 'open3'
2
+ require 'optparse'
3
+
4
+ require_relative 'tree_tagger/version'
5
+ require_relative 'tree_tagger/cli'
6
+
7
+ module Opener
8
+ class TreeTagger
9
+ attr_reader :options, :args
10
+
11
+ ##
12
+ # Hash containing the default options to use.
13
+ #
14
+ # @return [Hash]
15
+ #
16
+ DEFAULT_OPTIONS = {
17
+ :args => []
18
+ }.freeze
19
+
20
+ ##
21
+ # @param [Hash] options
22
+ #
23
+ # @option options [Array] :args Collection of arbitrary arguments to pass
24
+ # to the underlying kernel.
25
+ #
26
+ def initialize(options = {})
27
+ @args = options.delete(:args) || []
28
+ @options = DEFAULT_OPTIONS.merge(options)
29
+ end
30
+
31
+ def run(input)
32
+ stdout, stderr, process = capture(input)
33
+
34
+ if process.success?
35
+ STDERR.puts(stderr) unless stderr.empty?
36
+ else
37
+ abort stderr
38
+ end
39
+
40
+ return stdout, stderr, process
41
+ end
42
+
43
+ def capture(input)
44
+ Open3.capture3(*command.split(" "), :stdin_data=>input)
45
+ end
46
+
47
+ def command
48
+ return "#{adjust_python_path} python -E -OO #{kernel} #{args.join(' ')}"
49
+ end
50
+
51
+ protected
52
+
53
+ ##
54
+ # @return [String]
55
+ #
56
+ def adjust_python_path
57
+ site_packages = File.join(core_dir, 'site-packages')
58
+ "env PYTHONPATH=#{site_packages}:$PYTHONPATH"
59
+ end
60
+
61
+ def core_dir
62
+ File.expand_path("../../core", File.dirname(__FILE__))
63
+ end
64
+
65
+ def kernel
66
+ File.join(core_dir,'/tt_from_kaf_to_kaf.py')
67
+ end
68
+ end
69
+ end