RubyGems - treat - Versions diffs - 0.1.1 → 0.1.2 - Mend

treat 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (81)

data/INSTALL +1 -0
data/README +3 -0
data/TODO +14 -26
data/bin/INFO +1 -1
data/lib/treat/buildable.rb +10 -11
data/lib/treat/categories.rb +8 -6
data/lib/treat/category.rb +7 -2
data/lib/treat/delegatable.rb +64 -56
data/lib/treat/detectors/encoding/r_chardet19.rb +1 -1
data/lib/treat/detectors/language/language_detector.rb +2 -1
data/lib/treat/detectors/language/what_language.rb +2 -2
data/lib/treat/detectors.rb +3 -0
data/lib/treat/entities/entity.rb +1 -1
data/lib/treat/entities.rb +9 -10
data/lib/treat/exception.rb +3 -1
data/lib/treat/extractors/named_entity/abner.rb +1 -1
data/lib/treat/extractors/named_entity/stanford.rb +2 -2
data/lib/treat/extractors/time/chronic.rb +2 -2
data/lib/treat/extractors/time/nickel.rb +2 -2
data/lib/treat/extractors/topic_words/lda.rb +2 -2
data/lib/treat/extractors.rb +12 -9
data/lib/treat/feature.rb +6 -1
data/lib/treat/formatters/cleaners/html.rb +1 -1
data/lib/treat/formatters.rb +8 -8
data/lib/treat/group.rb +11 -10
data/lib/treat/inflectors/cardinal_words/linguistics.rb +3 -3
data/lib/treat/inflectors/{conjugators → conjugations}/linguistics.rb +6 -6
data/lib/treat/inflectors/{declensors → declensions}/en.rb +2 -2
data/lib/treat/inflectors/{declensors → declensions}/linguistics.rb +5 -5
data/lib/treat/inflectors/ordinal_words/linguistics.rb +4 -4
data/lib/treat/inflectors/{stemmers → stem}/porter.rb +1 -1
data/lib/treat/inflectors/{stemmers → stem}/porter_c.rb +3 -3
data/lib/treat/inflectors/{stemmers → stem}/uea.rb +3 -3
data/lib/treat/inflectors.rb +8 -21
data/lib/treat/kernel.rb +120 -0
data/lib/treat/languages/arabic.rb +14 -0
data/lib/treat/languages/categories.rb +5 -0
data/lib/treat/languages/chinese.rb +12 -0
data/lib/treat/languages/english/categories.rb +23 -0
data/lib/treat/{resources → languages/english}/tags.rb +127 -184
data/lib/treat/languages/english.rb +33 -0
data/lib/treat/languages/french.rb +17 -0
data/lib/treat/languages/german.rb +17 -0
data/lib/treat/languages/italian.rb +14 -0
data/lib/treat/{resources/languages.txt → languages/list.txt} +0 -0
data/lib/treat/languages/xinhua.rb +12 -0
data/lib/treat/languages.rb +91 -0
data/lib/treat/lexicalizers/category/from_tag.rb +20 -8
data/lib/treat/lexicalizers/synsets/rita_wn.rb +1 -1
data/lib/treat/lexicalizers/tag/brill.rb +2 -1
data/lib/treat/lexicalizers/tag/lingua.rb +2 -1
data/lib/treat/lexicalizers/tag/stanford.rb +16 -15
data/lib/treat/lexicalizers.rb +1 -1
data/lib/treat/object.rb +6 -0
data/lib/treat/processors/parsers/enju.rb +3 -2
data/lib/treat/processors/parsers/stanford.rb +15 -12
data/lib/treat/processors/segmenters/punkt.rb +1 -1
data/lib/treat/processors/segmenters/stanford.rb +7 -5
data/lib/treat/processors/segmenters/tactful.rb +1 -1
data/lib/treat/processors/tokenizers/multilingual.rb +2 -2
data/lib/treat/processors/tokenizers/stanford.rb +7 -5
data/lib/treat/visitable.rb +2 -1
data/lib/treat.rb +105 -54
data/test/tc_entity.rb +5 -0
data/test/tc_resources.rb +5 -5
data/test/tc_treat.rb +1 -2
data/test/tests.rb +2 -1
metadata +63 -64
data/lib/treat/formatters/serializers/yaml/helper.rb +0 -96
data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +0 -213
data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +0 -68
data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +0 -6
data/lib/treat/inflectors/lemmatizers/e_lemma.rb +0 -12
data/lib/treat/resources/categories.rb +0 -18
data/lib/treat/resources/delegates.rb +0 -96
data/lib/treat/resources/dependencies.rb +0 -0
data/lib/treat/resources/edges.rb +0 -8
data/lib/treat/resources/formats.rb +0 -23
data/lib/treat/resources/languages.rb +0 -86
data/lib/treat/resources.rb +0 -10
data/lib/treat/utilities.rb +0 -127

data/lib/treat/{resources → languages/english}/tags.rb RENAMED Viewed

@@ -1,60 +1,10 @@
-# encoding: UTF-8
 module Treat
-  module Resources
-    class Tags
+  module Languages
+    class English
       ClawsC5 = 0
       Brown = 1
       Penn = 2
-      Enju = 3
-      PTBWordTagToCategory = {
-        'CC' =>  :conjunction, # Coordinating conjunction
-        'CD' => :number, # Cardinal number
-        'DT' => :determiner, # Determiner
-        'DET' => :determiner, # Determiner
-        'EX' => :determiner, # Existential there
-        'FW' =>  :foreign, # Foreign word
-        'IN' =>  :preposition, # Preposition or subordinating conjunction
-        'JJ' =>  :adjective, # Adjective
-        'JJR' => :adjective, # Adjective, comparative
-        'JJS' => :adjective, # Adjective, superlative
-        'LS' =>  :list, # List item marker
-        'MD' =>  :modal, # Modal
-        'NN' =>  :noun, # Noun, singular or mass
-        'NNS' =>  :noun, # Noun, plural
-        'NNP' =>  :noun, # Proper noun, singular
-        'NNPS' =>  :noun, # Proper noun, plural
-        'PDT' =>  :determiner, # Predeterminer
-        'POS' =>  :determiner, # Possessive ending
-        'PRP' =>  :pronoun, # Personal pronoun
-        'PRP$' =>  :pronoun, # Possessive pronoun,
-        'PRPS' =>  :determiner, # Possessive determiner
-        'RB' =>  :adverb, # Adverb
-        'RBR' =>  :adverb, # Adverb, comparative
-        'RBS' =>  :adverb, # Adverb, superlative
-        'RP' =>  :particle, # Particle
-        'SYM' =>  :symbol, # Symbol
-        'TO' =>  :to, # to
-        'UH' =>  :interjection, # Interjection
-        'VB' =>  :verb, # Verb, base form
-        'VBD' =>  :verb, # Verb, past tense
-        'VBG' =>  :verb, # Verb, gerund or present participle
-        'VBN' =>  :verb, # Verb, past participle
-        'VBP' =>  :verb, # Verb, non-3rd person singular present
-        'VBZ' =>  :verb, # Verb, 3rd person singular present
-        'WDT' =>  :determiner, # Wh-determiner
-        'WP' =>  :pronoun, # Wh-pronoun
-        'WP$' =>  :pronoun, # Possessive wh-pronoun
-        'WRB' =>  :adverb, # Wh-adverb
-        ')' => :punctuation, # Right bracket
-        '(' => :punctuation,  # Left bracket
-        '.' => :punctuation, # Period
-        '\'\'' => :symbol, # Quote
-        ',' => :punctuation,
-        ';' => :punctuation
-      }
       PTBClauseTagDescription = [
         ['S', 'Simple declarative clause'],
@@ -89,141 +39,135 @@ module Treat
       ]
       PTBWordTagDescription = [
-=begin
-CC - Coordinating conjunction
-CD - Cardinal number
-DT - Determiner
-EX - Existential there
-FW - Foreign word
-IN - Preposition or subordinating conjunction
-JJ - Adjective
-JJR - Adjective, comparative
-JJS - Adjective, superlative
-LS - List item marker
-MD - Modal
-NN - Noun, singular or mass
-NNS - Noun, plural
-NNP - Proper noun, singular
-NNPS - Proper noun, plural
-PDT - Predeterminer
-POS - Possessive ending
-PRP - Personal pronoun
-PRP$ - Possessive pronoun (prolog version PRP-S)
-RB - Adverb
-RBR - Adverb, comparative
-RBS - Adverb, superlative
-RP - Particle
-SYM - Symbol
-TO - to
-UH - Interjection
-VB - Verb, base form
-VBD - Verb, past tense
-VBG - Verb, gerund or present participle
-VBN - Verb, past participle
-VBP - Verb, non-3rd person singular present
-VBZ - Verb, 3rd person singular present
-WDT - Wh-determiner
-WP - Wh-pronoun
-WP$ - Possessive wh-pronoun (prolog version WP-S)
-WRB - Wh-adverb
-=end
+        ['CC', 'Coordinating conjunction'],
+        ['CD', 'Cardinal number'],
+        ['DT', 'Determiner'],
+        ['EX', 'Existential there'],
+        ['FW', 'Foreign word'],
+        ['IN', 'Preposition or subordinating conjunction'],
+        ['JJ', 'Adjective'],
+        ['JJR', 'Adjective, comparative'],
+        ['JJS', 'Adjective, superlative'],
+        ['LS', 'List item marker'],
+        ['MD', 'Modal'],
+        ['NN', 'Noun, singular or mass'],
+        ['NNS', 'Noun, plural'],
+        ['NNP', 'Proper noun, singular'],
+        ['NNPS', 'Proper noun, plural'],
+        ['PDT', 'Predeterminer'],
+        ['POS', 'Possessive ending'],
+        ['PRP', 'Personal pronoun'],
+        ['PRP$', 'Possessive pronoun (prolog version PRP-S)'],
+        ['RB', 'Adverb'],
+        ['RBR', 'Adverb, comparative'],
+        ['RBS', 'Adverb, superlative'],
+        ['RP', 'Particle'],
+        ['SYM', 'Symbol'],
+        ['TO', 'to'],
+        ['UH', 'Interjection'],
+        ['VB', 'Verb, base form'],
+        ['VBD', 'Verb, past tense'],
+        ['VBG', 'Verb, gerund or present participle'],
+        ['VBN', 'Verb, past participle'],
+        ['VBP', 'Verb, non 3rd person singular present'],
+        ['VBZ', 'Verb, 3rd person singular present'],
+        ['WDT', 'Wh-determiner'],
+        ['WP', 'Wh-pronoun'],
+        ['WP$', 'Possessive wh-pronoun (prolog version WP-S)'],
+        ['WRB', 'Wh-adverb']
       ]
       BrownWordTagDescription = [
-=begin
-Tag	Description	Examples
+        ['.',	'sentence closer	. ; ? !'],
+        ['(',	'left parent']	 ,
+        [')',	'right parent'],
+        ['*',	'not'],
+        ['--',	'dash'],
+        [',',	'comma'],
+        [':',	'colon'],
+        ['ABL', 'pre-qualifier	quite, rather'],
+        ['ABN', 'pre-quantifier	half, all'],
+        ['ABX', 'pre-quantifier	both'],
+        ['AP', 'post-determiner	many, several, next'],
+        ['AT', 'article	a, the, no'],
+        ['BE', 'be	 '],
+        ['BED', 'were	 '],
+        ['BEDZ', 'was	 '],
+        ['BEG', 'being	 '],
+        ['BEM', 'am	 '],
+        ['BEN', 'been	 '],
+        ['BER', 'are, art	 '],
+        ['BEZ', 'is	 '],
+        ['CC', 'coordinating conjunction	and, or'],
+        ['CD', 'cardinal numeral	one, two, 2, etc.'],
+        ['CS', 'subordinating conjunction	if, although'],
+        ['DO', 'do	 '],
+        ['DOD', 'did	 '],
+        ['DOZ', 'does	 '],
+        ['DT', 'singular determiner	this, that'],
+        ['DTI', 'singular or plural determiner/quantifier	some, any'],
+        ['DTS', 'plural determiner	these, those'],
+        ['DTX', 'determiner/double conjunction	either'],
+        ['EX', 'existentil there	 '],
+        ['FW', 'foreign word (hyphenated before regular tag)	 '],
+        ['HL', 'word occurring in headline (hyphenated after regular tag)	 '],
+        ['HV', 'have	 '],
+        ['HVD', 'had (past tense)	 '],
+        ['HVG', 'having	 '],
+        ['HVN', 'had (past participle)	 '],
+        ['HVZ', 'has	 '],
+        ['IN', 'preposition	 '],
+        ['JJ', 'adjective	 '],
+        ['JJR', 'comparative adjective	 '],
+        ['JJS', 'semantically superlative adjective	 chief, top'],
+        ['JJT', 'morphologically superlative adjective	biggest'],
+        ['MD', 'modal auxiliary	can, should, will'],
+        ['NC', 'cited word (hyphenated after regular tag)	 '],
+        ['NN', 'singular or mass noun	 '],
+        ['NN$', 'possessive singular noun	 '],
+        ['NNS', 'plural noun	 '],
+        ['NNS$', 'possessive plural noun	 '],
+        ['NP', 'proper noun or part of name phrase	 '],
+        ['NP$', 'possessive proper noun	 '],
+        ['NPS', 'plural proper noun	 '],
+        ['NPS$', 'possessive plural proper noun	 '],
+        ['NR', 'adverbial noun	home, today, west'],
+        ['NRS', 'plural adverbial noun'],
+        ['OD', 'ordinal numeral	first, 2nd'],
+        ['PN', 'nominal pronoun	everybody, nothing'],
+        ['PN$', 'possessive nominal pronoun	 '],
+        ['PP$', 'possessive personal pronoun	my, our'],
+        ['PP$$', 'second (nominal) possessive pronoun	mine, ours'],
+        ['PPL', 'singular reflexive/intensive personal pronoun	myself'],
+        ['PPLS', 'plural reflexive/intensive personal pronoun	ourselves'],
+        ['PPO', 'objective personal pronoun	me, him, it, them'],
+        ['PPS', '3rd. singular nominative pronoun	he, she, it, one'],
+        ['PPSS', 'other nominative personal pronoun	I, we, they, you'],
+        ['QL', 'qualifier	very, fairly'],
+        ['QLP', 'post-qualifier	enough, indeed'],
+        ['RB', 'adverb	 '],
+        ['RBR', 'comparative adverb	 '],
+        ['RBT', 'superlative adverb	 '],
+        ['RN', 'nominal adverb	here then, indoors	 '],
+        ['RP', 'adverb/particle	about, off, up'],
+        ['TL', 'word occurring in title (hyphenated after regular tag)'],
+        ['TO', 'infinitive marker to	 '],
+        ['UH', 'interjection, exclamation	 '],
+        ['VB', 'verb, base form	 '],
+        ['VBD', 'verb, past tense	 '],
+        ['VBG', 'verb, present participle/gerund	 '],
+        ['VBN', 'verb, past participle	 '],
+        ['VBZ', 'verb, 3rd. singular present	 '],
+        ['WDT', 'wh- determiner	what, which'],
+        ['WP$', 'possessive wh- pronoun	whose'],
+        ['WPO', 'objective wh- pronoun	whom, which, that'],
+        ['WPS', 'nominative wh- pronoun	who, which, that'],
+        ['WQL', 'wh- qualifier	how'],
+        ['WRB', 'wh- adverb	how, where, when']
-.	sentence closer	. ; ? !
-(	left paren
-)	right paren
-*	not, n't
---	dash
-,	comma
-:	colon
-ABL	pre-qualifier	quite, rather
-ABN	pre-quantifier	half, all
-ABX	pre-quantifier	both
-AP	post-determiner	many, several, next
-AT	article	a, the, no
-BE	be
-BED	were
-BEDZ	was
-BEG	being
-BEM	am
-BEN	been
-BER	are, art
-BEZ	is
-CC	coordinating conjunction	and, or
-CD	cardinal numeral	one, two, 2, etc.
-CS	subordinating conjunction	if, although
-DO	do
-DOD	did
-DOZ	does
-DT	singular determiner	this, that
-DTI	singular or plural determiner/quantifier	some, any
-DTS	plural determiner	these, those
-DTX	determiner/double conjunction	either
-EX	existentil there
-FW	foreign word (hyphenated before regular tag)
-HL	word occurring in headline (hyphenated after regular tag)
-HV	have
-HVD	had (past tense)
-HVG	having
-HVN	had (past participle)
-HVZ	has
-IN	preposition
-JJ	adjective
-JJR	comparative adjective
-JJS	semantically superlative adjective	 chief, top
-JJT	morphologically superlative adjective	biggest
-MD	modal auxiliary	can, should, will
-NC	cited word (hyphenated after regular tag)
-NN	singular or mass noun
-NN$	possessive singular noun
-NNS	plural noun
-NNS$	possessive plural noun
-NP	proper noun or part of name phrase
-NP$	possessive proper noun
-NPS	plural proper noun
-NPS$	possessive plural proper noun
-NR	adverbial noun	home, today, west
-NRS	plural adverbial noun
-OD	ordinal numeral	first, 2nd
-PN	nominal pronoun	everybody, nothing
-PN$	possessive nominal pronoun
-PP$	possessive personal pronoun	my, our
-PP$$	second (nominal) possessive pronoun	mine, ours
-PPL	singular reflexive/intensive personal pronoun	myself
-PPLS	plural reflexive/intensive personal pronoun	ourselves
-PPO	objective personal pronoun	me, him, it, them
-PPS	3rd. singular nominative pronoun	he, she, it, one
-PPSS	other nominative personal pronoun	I, we, they, you
-QL	qualifier	very, fairly
-QLP	post-qualifier	enough, indeed
-RB	adverb
-RBR	comparative adverb
-RBT	superlative adverb
-RN	nominal adverb	here then, indoors
-RP	adverb/particle	about, off, up
-TL	word occurring in title (hyphenated after
- 	regular tag)
-TO	infinitive marker to
-UH	interjection, exclamation
-VB	verb, base form
-VBD	verb, past tense
-VBG	verb, present participle/gerund
-VBN	verb, past participle
-VBZ	verb, 3rd. singular present
-WDT	wh- determiner	what, which
-WP$	possessive wh- pronoun	whose
-WPO	objective wh- pronoun	whom, which, that
-WPS	nominative wh- pronoun	who, which, that
-WQL	wh- qualifier	how
-WRB	wh- adverb	how, where, when
-=end
       ]
+      # A description of Enju categories.
       EnjuCatDescription = [
         ['ADJ',	'Adjective'],
         ['ADV',	'Adverb'],
@@ -330,7 +274,7 @@ WRB	wh- adverb	how, where, when
         'Noun, proper, plural', ['NP0', 'NPS', 'NNPS'],
         'Noun, adverbial', ['NN0', 'NR', 'NN'],
         'Noun, adverbial, plural', ['NN2', 'NRS', 'NNS'],
-        'Pronoun, nominal (indefinite)', ['PNI', 'PN', 'NN'],
+        'Pronoun, nominal (indefinite)', ['PNI', 'PN', 'PRP'],
         'Pronoun, personal, subject', ['PNP', 'PPSS', 'PRP'],
         'Pronoun, personal, subject, 3SG', ['PNP', 'PPS', 'PRP'],
         'Pronoun, personal, object', ['PNP', 'PPO', 'PRP'],
@@ -387,7 +331,6 @@ WRB	wh- adverb	how, where, when
         'Symbol, alphabetical', ['ZZ0', '', ''],
         'Symbol, list item', ['', '', 'LS']
       ]
     end
   end
 end

data/lib/treat/languages/english.rb ADDED Viewed

@@ -0,0 +1,33 @@
+module Treat
+  module Languages
+    class English
+      require 'treat/languages/english/tags'
+      require 'treat/languages/english/categories'
+      Extractors = {
+        time: [:chronic],
+        topics: [:reuters],
+        topic_words: [:lda],
+        key_sentences: [:topics_frequency]
+      }
+      Processors = {
+        chunkers: [:txt],
+        parsers: [:enju, :stanford],
+        segmenters: [:tactful, :punkt, :stanford],
+        tokenizers: [:multilingual, :macintyre, :perl, :punkt, :tactful, :stanford]
+      }
+      Lexicalizers = {
+        category: [:from_tag],
+        linkages: [:naive],
+        synsets: [:wordnet, :rita_wn],
+        tag: [:brill, :lingua, :stanford]
+      }
+      Inflectors = {
+        conjugations: [:linguistics],
+        declensions: [:linguistics, :english],
+        stem: [:porter_c, :porter, :uea],
+        ordinal_words: [:linguistics],
+        cardinal_words: [:linguistics]
+      }
+    end
+  end
+end

data/lib/treat/languages/french.rb ADDED Viewed

@@ -0,0 +1,17 @@
+module Treat
+  module Languages
+    class French
+      Extractors = {}
+      Inflectors = {}
+      Lexicalizers = {
+        tag: [:stanford]
+      }
+      Processors = {
+        chunkers: [:txt],
+        parsers: [:stanford],
+        segmenters: [:tactful, :punkt, :stanford],
+        tokenizers: [:multilingual, :macintyre, :perl, :punkt, :tactful, :stanford]
+      }
+    end
+  end
+end

data/lib/treat/languages/german.rb ADDED Viewed

@@ -0,0 +1,17 @@
+module Treat
+  module Languages
+    class German
+      Extractors = {}
+      Inflectors = {}
+      Lexicalizers = {
+        tag: [:stanford]
+      }
+      Processors = {
+        chunkers: [:txt],
+        parsers: [:stanford],
+        segmenters: [:tactful, :punkt, :stanford],
+        tokenizers: [:multilingual, :macintyre, :perl, :punkt, :tactful, :stanford]
+      }
+    end
+  end
+end

data/lib/treat/languages/italian.rb ADDED Viewed

@@ -0,0 +1,14 @@
+module Treat
+  module Languages
+    class Italian
+      Extractors = {}
+      Inflectors = {}
+      Lexicalizers = {}
+      Processors = {
+        chunkers: [:txt],
+        segmenters: [:tactful, :punkt, :stanford],
+        tokenizers: [:multilingual, :macintyre, :perl, :punkt, :tactful, :stanford]
+      }
+    end
+  end
+end

data/lib/treat/{resources/languages.txt → languages/list.txt} RENAMED Viewed

File without changes

data/lib/treat/languages/xinhua.rb ADDED Viewed

@@ -0,0 +1,12 @@
+module Treat
+  module Languages
+    class Xinhua
+      Extractors = {}
+      Inflectors = {}
+      Lexicalizers = {}
+      Processors = {
+        parsers: [:stanford]
+      }
+    end
+  end
+end

data/lib/treat/languages.rb ADDED Viewed

@@ -0,0 +1,91 @@
+module Treat
+  # This module provides linguistic resources
+  # for the Treat library, including information
+  # about language codes, the functions available
+  # for each language, and the different tags used
+  # to markup that language.
+  module Languages
+    Dir["#{Treat.lib}/treat/languages/*.rb"].each { |file| require file }
+    ISO639_1 = 1
+    ISO639_2 = 2
+    # Describe a language code (ISO-639-1 or ISO-639-2)
+    # or its full text description in full French or English.
+    def self.describe(lang, desc_lang = :en)
+      raise "Must provide a non-nil language identifier to describe." if lang.nil?
+      lang = find(lang).to_s
+      if [:en, :eng, :english, :anglais].include?(desc_lang)
+        l = @@english_full.key(lang)
+      elsif [:fr, :fra, :french, :french].include?(desc_lang)
+        l = @@french_full.key(lang)
+      else
+        raise Treat::Exception,
+        "Unknown language to describe: #{desc_lang}."
+      end
+      not_found(lang) if l.nil?
+      l.intern
+    end
+    # Raise an error message when a language code
+    # or description is not found and suggest
+    # possible misspellings.
+    def self.not_found(lang)
+      msg = "Language '#{lang}' does not exist."
+      all = @@iso639_2.keys + @@iso639_1.keys +
+      @@english_full.keys + @@french_full.keys
+      msg += did_you_mean?(all, lang)
+      raise Treat::Exception, msg
+    end
+    # Return the class representing a language.
+    def self.get(lang)
+      const_get(Treat::Languages.describe(lang).to_s.capitalize)
+    end
+    # Find a language by ISO-639-1 or ISO-639-2 code
+    # or full name (in English or French) and return
+    # the ISO-639-1 or ISO-639-2 language code as a
+    # lowercase identifier.
+    def self.find(lang, rc = ISO639_2)
+      raise "Must provide a non-nil language identifier to describe." if lang.nil?
+      get_languages
+      lang = lang.to_s.downcase
+      if @@iso639_1.has_key?(lang)
+        return :"#{lang}" if rc == ISO639_1
+        return :"#{@@iso639_1[lang]}" if rc == ISO639_2
+      elsif @@iso639_2.has_key?(lang)
+        return :"#{lang}" if rc == ISO639_2
+        return :"#{@@iso639_2[lang]}" if rc == ISO639_1
+      elsif @@english_full.has_key?(lang)
+        return :"#{@@english_full[lang]}" if rc == ISO639_2
+        return :"#{@@iso639_2[@@english_full[lang]]}" if rc == ISO639_1
+      elsif @@french_full.has_key?(lang)
+        return :"#{@@french_full[lang]}" if rc == ISO639_2
+        return :"#{@@iso639_1[@@french_full[lang]]}" if rc == ISO639_2
+      else
+        not_found(lang)
+      end
+    end
+    @@loaded = false
+    # Get the languages from the dictionary.
+    def self.get_languages
+      return if @@loaded
+      @@iso639_1 = {}; @@iso639_2 = {};
+      @@english_full = {}; @@french_full = {}
+      languages = IO.readlines(Treat.lib + '/treat/languages/list.txt')
+      languages.each do |language|
+        iso639_2, iso639_1, english_desc, french_desc =
+        language.split(',')
+        @@iso639_1[iso639_1] = iso639_2
+        @@iso639_2[iso639_2] = iso639_1
+        unless english_desc.nil?
+          english_desc.strip.downcase.split('|').each do |l|
+            @@english_full[l.downcase.strip] = iso639_2
+          end
+        end
+        unless french_desc.nil?
+          french_desc.strip.downcase.split('|').each do |l|
+            @@french_full[l.downcase.strip] = iso639_2
+          end
+        end
+      end
+      @@loaded = true
+    end
+  end
+end

data/lib/treat/lexicalizers/category/from_tag.rb CHANGED Viewed

@@ -4,24 +4,36 @@ module Treat
       # A class that detects the category of a word from its tag,
       # using the default tagger for the language of the entity.
       class FromTag
+        DefaultOptions = { tagger: nil }
         # Find the category of the current entity.
         # Options:
         # :tagger => (Symbol) force the use of a tagger.
         # :tag_to_cat => (Hash) a list of categories for each possible tag.
         def self.category(entity, options = {})
-          if options.empty?
-            options = {
-              tagger: nil,
-              tag_to_cat: Treat::Resources::Tags::PTBWordTagToCategory
-            }
-          end
+          options = DefaultOptions.merge(options)
           tag = options[:tagger].nil? ? entity.tag : entity.tag(options[:tagger])
-          cat = options[:tag_to_cat][tag]
+          lang = Treat::Languages.get(entity.language)
+          cat = lang::WordTagToCategory[tag]
           if cat.nil?
             warn "Category not found for tag #{tag}."
             :unknown
           else
-            cat
+            if cat.size == 1
+              return cat[0]
+            else
+              if entity.has?(:tag_set)
+                if cat[entity.tag_set]
+                  return cat[entity.tag_set]
+                else
+                  raise Treat::Exception,
+                  "The specified tag set (#{entity.tag_set})" +
+                  " does not contain the tag #{tag}."
+                end
+              else
+                raise Treat::Exception,
+                "No information can be found regarding which tag set to use."
+              end
+            end
           end
         end
       end

data/lib/treat/lexicalizers/synsets/rita_wn.rb CHANGED Viewed

@@ -4,7 +4,7 @@ module Treat
       # Currently not implemented.
       class RitaWn
         # Require the Ruby-Java bridge.
-        #silently do
+        #silence_warnings do
           require 'rjb'
           # Load the RitaWN jars.
           Rjb::load("#{Treat.bin}/jwnl/jwnl.jar", [])

data/lib/treat/lexicalizers/tag/brill.rb CHANGED Viewed

@@ -25,7 +25,7 @@ module Treat
         patch = false
         # Require the 'rbtagger' gem.
         begin
-          silently { require 'rbtagger' }
+          silence_warnings { require 'rbtagger' }
         # This whole mess is required to deal with
         # the fact that the 'rbtagger' gem defines
         # a top-level module called 'Word', which
@@ -73,6 +73,7 @@ module Treat
           # Create the tagger if necessary
           @@tagger ||= ::Brill::Tagger.new(options[:lexicon],
           options[:lexical_rules], options[:contextual_rules])
+          entity.set :tag_set, :penn
           # Perform tagging.
           if entity.type == :word
             # Setup the context of the word

data/lib/treat/lexicalizers/tag/lingua.rb CHANGED Viewed

@@ -17,7 +17,7 @@ module Treat
       # http://cpansearch.perl.org/src/ACOBURN/Lingua-EN-Tagger-0.15/
       class Lingua
         # Require the 'engtagger' gem.
-        silently { require 'engtagger' }
+        silence_warnings { require 'engtagger' }
         # Hold the tagger.
         @@tagger = nil
         # Hold the user-set options
@@ -46,6 +46,7 @@ module Treat
             @@tagger = nil # Reset the tagger
           end
           @@tagger ||= ::EngTagger.new(@@options)
+          entity.set :tag_set, :penn
           left = entity.left
           if left.nil? || left.type != :word
             left_tag = 'pp'