treat 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81) hide show
  1. data/INSTALL +1 -0
  2. data/README +3 -0
  3. data/TODO +14 -26
  4. data/bin/INFO +1 -1
  5. data/lib/treat/buildable.rb +10 -11
  6. data/lib/treat/categories.rb +8 -6
  7. data/lib/treat/category.rb +7 -2
  8. data/lib/treat/delegatable.rb +64 -56
  9. data/lib/treat/detectors/encoding/r_chardet19.rb +1 -1
  10. data/lib/treat/detectors/language/language_detector.rb +2 -1
  11. data/lib/treat/detectors/language/what_language.rb +2 -2
  12. data/lib/treat/detectors.rb +3 -0
  13. data/lib/treat/entities/entity.rb +1 -1
  14. data/lib/treat/entities.rb +9 -10
  15. data/lib/treat/exception.rb +3 -1
  16. data/lib/treat/extractors/named_entity/abner.rb +1 -1
  17. data/lib/treat/extractors/named_entity/stanford.rb +2 -2
  18. data/lib/treat/extractors/time/chronic.rb +2 -2
  19. data/lib/treat/extractors/time/nickel.rb +2 -2
  20. data/lib/treat/extractors/topic_words/lda.rb +2 -2
  21. data/lib/treat/extractors.rb +12 -9
  22. data/lib/treat/feature.rb +6 -1
  23. data/lib/treat/formatters/cleaners/html.rb +1 -1
  24. data/lib/treat/formatters.rb +8 -8
  25. data/lib/treat/group.rb +11 -10
  26. data/lib/treat/inflectors/cardinal_words/linguistics.rb +3 -3
  27. data/lib/treat/inflectors/{conjugators → conjugations}/linguistics.rb +6 -6
  28. data/lib/treat/inflectors/{declensors → declensions}/en.rb +2 -2
  29. data/lib/treat/inflectors/{declensors → declensions}/linguistics.rb +5 -5
  30. data/lib/treat/inflectors/ordinal_words/linguistics.rb +4 -4
  31. data/lib/treat/inflectors/{stemmers → stem}/porter.rb +1 -1
  32. data/lib/treat/inflectors/{stemmers → stem}/porter_c.rb +3 -3
  33. data/lib/treat/inflectors/{stemmers → stem}/uea.rb +3 -3
  34. data/lib/treat/inflectors.rb +8 -21
  35. data/lib/treat/kernel.rb +120 -0
  36. data/lib/treat/languages/arabic.rb +14 -0
  37. data/lib/treat/languages/categories.rb +5 -0
  38. data/lib/treat/languages/chinese.rb +12 -0
  39. data/lib/treat/languages/english/categories.rb +23 -0
  40. data/lib/treat/{resources → languages/english}/tags.rb +127 -184
  41. data/lib/treat/languages/english.rb +33 -0
  42. data/lib/treat/languages/french.rb +17 -0
  43. data/lib/treat/languages/german.rb +17 -0
  44. data/lib/treat/languages/italian.rb +14 -0
  45. data/lib/treat/{resources/languages.txt → languages/list.txt} +0 -0
  46. data/lib/treat/languages/xinhua.rb +12 -0
  47. data/lib/treat/languages.rb +91 -0
  48. data/lib/treat/lexicalizers/category/from_tag.rb +20 -8
  49. data/lib/treat/lexicalizers/synsets/rita_wn.rb +1 -1
  50. data/lib/treat/lexicalizers/tag/brill.rb +2 -1
  51. data/lib/treat/lexicalizers/tag/lingua.rb +2 -1
  52. data/lib/treat/lexicalizers/tag/stanford.rb +16 -15
  53. data/lib/treat/lexicalizers.rb +1 -1
  54. data/lib/treat/object.rb +6 -0
  55. data/lib/treat/processors/parsers/enju.rb +3 -2
  56. data/lib/treat/processors/parsers/stanford.rb +15 -12
  57. data/lib/treat/processors/segmenters/punkt.rb +1 -1
  58. data/lib/treat/processors/segmenters/stanford.rb +7 -5
  59. data/lib/treat/processors/segmenters/tactful.rb +1 -1
  60. data/lib/treat/processors/tokenizers/multilingual.rb +2 -2
  61. data/lib/treat/processors/tokenizers/stanford.rb +7 -5
  62. data/lib/treat/visitable.rb +2 -1
  63. data/lib/treat.rb +105 -54
  64. data/test/tc_entity.rb +5 -0
  65. data/test/tc_resources.rb +5 -5
  66. data/test/tc_treat.rb +1 -2
  67. data/test/tests.rb +2 -1
  68. metadata +63 -64
  69. data/lib/treat/formatters/serializers/yaml/helper.rb +0 -96
  70. data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +0 -213
  71. data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +0 -68
  72. data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +0 -6
  73. data/lib/treat/inflectors/lemmatizers/e_lemma.rb +0 -12
  74. data/lib/treat/resources/categories.rb +0 -18
  75. data/lib/treat/resources/delegates.rb +0 -96
  76. data/lib/treat/resources/dependencies.rb +0 -0
  77. data/lib/treat/resources/edges.rb +0 -8
  78. data/lib/treat/resources/formats.rb +0 -23
  79. data/lib/treat/resources/languages.rb +0 -86
  80. data/lib/treat/resources.rb +0 -10
  81. data/lib/treat/utilities.rb +0 -127
@@ -1,68 +0,0 @@
1
- #include "wn.h"
2
- #include "wnconsts.h"
3
- #include "ruby.h"
4
-
5
- /*
6
-
7
- Copyright (C) 2004 UTIYAMA Masao <mutiyama@crl.go.jp>
8
-
9
- This program is free software; you can redistribute it and/or modify
10
- it under the terms of the GNU General Public License as published by
11
- the Free Software Foundation; either version 2 of the License, or
12
- (at your option) any later version.
13
-
14
- This program is distributed in the hope that it will be useful,
15
- but WITHOUT ANY WARRANTY; without even the implied warranty of
16
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
- GNU General Public License for more details.
18
-
19
- You should have received a copy of the GNU General Public License
20
- along with this program; if not, write to the Free Software
21
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22
- */
23
-
24
- static VALUE
25
- parse(VALUE klass, VALUE rb_word, VALUE rb_pos)
26
- {
27
- char *word;
28
- char *POS = STR2CSTR(rb_pos);
29
- char *lemma;
30
- int pos;
31
- VALUE retval = rb_ary_new();
32
-
33
- word = malloc(strlen(STR2CSTR(rb_word))+1);
34
- if(!word){rb_raise(rb_eStandardError, "malloc failed.\n");}
35
- strcpy(word, STR2CSTR(rb_word));
36
-
37
- if(strcmp(POS,"noun")==0){pos = NOUN;}
38
- else if(strcmp(POS,"verb")==0){pos = VERB;}
39
- else if(strcmp(POS,"adj")==0){pos = ADJ;}
40
- else if(strcmp(POS,"adv")==0){pos = ADV;}
41
- else{
42
- rb_raise(rb_eStandardError, "%s should be (noun|verb|adj|adv)\n", POS);
43
- }
44
- if(is_defined(word, pos)){
45
- /*printf("* %s found as is.\n", word);*/
46
- rb_ary_push(retval, rb_str_new2(word));
47
- }
48
- if((lemma=morphstr(word, pos))!=NULL){
49
- do {
50
- if(is_defined(lemma, pos)){
51
- /*printf("* %s => %s found.\n", word, lemma);*/
52
- rb_ary_push(retval, rb_str_new2(lemma));
53
- }
54
- } while((lemma=morphstr(NULL, pos))!=NULL);
55
- }
56
- free(word);
57
- return retval;
58
- }
59
-
60
- void
61
- Init_elemma()
62
- {
63
- VALUE mod = rb_define_module("ELemma");
64
- rb_define_module_function(mod, "parse", parse, 2);
65
- if(wninit()){
66
- rb_raise(rb_eStandardError, "Cannot open WordNet database\n");
67
- }
68
- }
@@ -1,6 +0,0 @@
1
- require 'mkmf'
2
-
3
- $CFLAGS = "-Wall -I/usr/local/WordNet-2.1/include/"
4
- $LOCAL_LIBS = "-L/usr/local/WordNet-2.1/lib -lwn"
5
-
6
- create_makefile("elemma")
@@ -1,12 +0,0 @@
1
- module Treat
2
- module Inflectors
3
- module Lemmatizers
4
- class ELemma
5
- silently { require 'treat/inflectors/lemmatizers/elemma/elemma'}
6
- def self.lemma(entity, options = nil)
7
- ::ELemma::parse(word, entity.tag)
8
- end
9
- end
10
- end
11
- end
12
- end
@@ -1,18 +0,0 @@
1
- module Treat
2
- module Resources
3
- class Categories
4
- List = [
5
- :adjective, :adverb, :noun, :verb, :interjection,
6
- :clitic, :coverb, :conjunction, :determiner, :particle,
7
- :preposition, :pronoun, :number, :symbol, :punctuation,
8
- :complementizer
9
- ]
10
- wttc = {}
11
- Treat::Resources::Tags::AlignedWordTags.each_slice(2) do |desc, tags|
12
- desc = desc.gsub(',', ' ,').split(' ')[0].downcase
13
- tags.each { |tag| wttc[tag] = desc.intern }
14
- end
15
- WordTagToCategory = wttc
16
- end
17
- end
18
- end
@@ -1,96 +0,0 @@
1
- module Treat
2
- module Resources
3
- module Delegates
4
- class English
5
- Extractors = {
6
- time: [:chronic],
7
- topics: [:reuters],
8
- topic_words: [:lda],
9
- key_sentences: [:topics_frequency]
10
- }
11
- Processors = {
12
- chunkers: [:txt],
13
- parsers: [:enju, :stanford],
14
- segmenters: [:tactful, :punkt, :stanford],
15
- tokenizers: [:multilingual, :macintyre, :perl, :punkt, :tactful, :stanford]
16
- }
17
- Lexicalizers = {
18
- category: [:from_tag],
19
- linkages: [:naive],
20
- synsets: [:wordnet, :rita_wn],
21
- tag: [:brill, :lingua, :stanford]
22
- }
23
- Inflectors = {
24
- conjugators: [:linguistics],
25
- declensors: [:linguistics, :english],
26
- lemmatizers: [:e_lemma],
27
- stemmers: [:porter_c, :porter, :uea],
28
- ordinal_words: [:linguistics],
29
- cardinal_words: [:linguistics]
30
- }
31
- end
32
- class German
33
- Extractors = {}
34
- Inflectors = {}
35
- Lexicalizers = {
36
- tag: [:stanford]
37
- }
38
- Processors = {
39
- chunkers: [:txt],
40
- parsers: [:stanford],
41
- segmenters: [:tactful, :punkt, :stanford],
42
- tokenizers: [:multilingual, :macintyre, :perl, :punkt, :tactful, :stanford]
43
- }
44
- end
45
- class French
46
- Extractors = {}
47
- Inflectors = {}
48
- Lexicalizers = {
49
- tag: [:stanford]
50
- }
51
- Processors = {
52
- chunkers: [:txt],
53
- parsers: [:stanford],
54
- segmenters: [:tactful, :punkt, :stanford],
55
- tokenizers: [:multilingual, :macintyre, :perl, :punkt, :tactful, :stanford]
56
- }
57
- end
58
- class Italian
59
- Extractors = {}
60
- Inflectors = {}
61
- Lexicalizers = {}
62
- Processors = {
63
- chunkers: [:txt],
64
- segmenters: [:tactful, :punkt, :stanford],
65
- tokenizers: [:multilingual, :macintyre, :perl, :punkt, :tactful, :stanford]
66
- }
67
- end
68
- class Arabic
69
- Extractors = {}
70
- Inflectors = {}
71
- Lexicalizers = {
72
- tag: [:stanford]
73
- }
74
- Processors = {
75
- parsers: [:stanford]
76
- }
77
- end
78
- class Chinese
79
- Extractors = {}
80
- Inflectors = {}
81
- Lexicalizers = {
82
- tag: [:stanford]
83
- }
84
- Processors = {}
85
- end
86
- class Xinhua
87
- Extractors = {}
88
- Inflectors = {}
89
- Lexicalizers = {}
90
- Processors = {
91
- parsers: [:stanford]
92
- }
93
- end
94
- end
95
- end
96
- end
File without changes
@@ -1,8 +0,0 @@
1
- # http://nlp.stanford.edu/software/dependencies_manual.pdf
2
-
3
- =begin
4
- ENJU
5
-
6
- pred: noun_arg0, noun_arg1, noun_arg2, noun_arg12, it_arg1, there_arg0, quote_arg2, quote_arg12, quote_arg23, quote_arg123, poss_arg2, poss_arg12, aux_arg12, aux_mod_arg12, verb_arg1, verb_arg12, verb_arg123, verb_arg1234, verb_mod_arg1, verb_mod_arg12, verb_mod_arg123, verb_mod_arg1234, adj_arg1, adj_arg12, adj_mod_arg1, adj_mod_arg12, conj_arg1, conj_arg12, conj_arg123, coord_arg12, det_arg1, prep_arg12, prep_arg123, prep_mod_arg12, prep_mod_arg123, lgs_arg2, dtv_arg2, punct_arg1, app_arg12, lparen_arg123, rparen_arg0, comp_arg1, comp_arg12, comp_mod_arg1, relative_arg1, relative_arg12
7
-
8
- =end
@@ -1,23 +0,0 @@
1
- module Treat
2
- module Resources
3
- module Format
4
-
5
- class XML
6
- require 'nokogiri'
7
- def self.validate(document_path, schema_path, root_element)
8
- schema = Nokogiri::XML::Schema(File.read(schema_path))
9
- document = Nokogiri::XML(File.read(document_path))
10
- schema.validate(document.xpath("//#{root_element}").to_s)
11
- end
12
- validate('input.xml', 'schema.xdf', 'container').each do |error|
13
- puts error.message
14
- end
15
- end
16
-
17
- class HTML < XML
18
-
19
- end
20
-
21
- end
22
- end
23
- end
@@ -1,86 +0,0 @@
1
- module Treat
2
- module Resources
3
- # Dictionary of ISO-639-1, ISO-639-2 language codes,
4
- # as well as their full text description in both
5
- # English and French.
6
- module Languages
7
- ISO639_1 = 1
8
- ISO639_2 = 2
9
- # Describe a language code (ISO-639-1 or ISO-639-2)
10
- # or its full text description in full French or English.
11
- def self.describe(lang, desc_lang = :en)
12
- raise "Must provide a non-nil language identifier to describe." if lang.nil?
13
- lang = find(lang).to_s
14
- if [:en, :eng, :english, :anglais].include?(desc_lang)
15
- l = @@english_full.key(lang)
16
- elsif [:fr, :fra, :french, :french].include?(desc_lang)
17
- l = @@french_full.key(lang)
18
- else
19
- raise Treat::Exception,
20
- "Unknown language to describe: #{desc_lang}."
21
- end
22
- not_found(lang) if l.nil?
23
- l.intern
24
- end
25
- # Raise an error message when a language code
26
- # or description is not found and suggest
27
- # possible misspellings.
28
- def self.not_found(lang)
29
- msg = "Language '#{lang}' does not exist."
30
- all = @@iso639_2.keys + @@iso639_1.keys +
31
- @@english_full.keys + @@french_full.keys
32
- msg += did_you_mean?(all, lang)
33
- raise Treat::Exception, msg
34
- end
35
- # Find a language by ISO-639-1 or ISO-639-2 code
36
- # or full name (in English or French) and return
37
- # the ISO-639-1 or ISO-639-2 language code as a
38
- # lowercase identifier.
39
- def self.find(lang, rc = ISO639_2)
40
- raise "Must provide a non-nil language identifier to describe." if lang.nil?
41
- get_languages
42
- lang = lang.to_s.downcase
43
- if @@iso639_1.has_key?(lang)
44
- return :"#{lang}" if rc == ISO639_1
45
- return :"#{@@iso639_1[lang]}" if rc == ISO639_2
46
- elsif @@iso639_2.has_key?(lang)
47
- return :"#{lang}" if rc == ISO639_2
48
- return :"#{@@iso639_2[lang]}" if rc == ISO639_1
49
- elsif @@english_full.has_key?(lang)
50
- return :"#{@@english_full[lang]}" if rc == ISO639_2
51
- return :"#{@@iso639_2[@@english_full[lang]]}" if rc == ISO639_1
52
- elsif @@french_full.has_key?(lang)
53
- return :"#{@@french_full[lang]}" if rc == ISO639_2
54
- return :"#{@@iso639_1[@@french_full[lang]]}" if rc == ISO639_2
55
- else
56
- not_found(lang)
57
- end
58
- end
59
- @@loaded = false
60
- # Get the languages from the dictionary.
61
- def self.get_languages
62
- return if @@loaded
63
- @@iso639_1 = {}; @@iso639_2 = {};
64
- @@english_full = {}; @@french_full = {}
65
- languages = IO.readlines(Treat.lib + '/treat/resources/languages.txt')
66
- languages.each do |language|
67
- iso639_2, iso639_1, english_desc, french_desc =
68
- language.split(',')
69
- @@iso639_1[iso639_1] = iso639_2
70
- @@iso639_2[iso639_2] = iso639_1
71
- unless english_desc.nil?
72
- english_desc.strip.downcase.split('|').each do |l|
73
- @@english_full[l.downcase.strip] = iso639_2
74
- end
75
- end
76
- unless french_desc.nil?
77
- french_desc.strip.downcase.split('|').each do |l|
78
- @@french_full[l.downcase.strip] = iso639_2
79
- end
80
- end
81
- end
82
- @@loaded = true
83
- end
84
- end
85
- end
86
- end
@@ -1,10 +0,0 @@
1
- module Treat
2
- module Resources
3
- require 'treat/resources/delegates'
4
- require 'treat/resources/dependencies'
5
- require 'treat/resources/edges'
6
- require 'treat/resources/languages'
7
- require 'treat/resources/tags'
8
- require 'treat/resources/categories'
9
- end
10
- end
@@ -1,127 +0,0 @@
1
- module Treat
2
- # Provides utility functions used across the library.
3
- module Utilities
4
- # Require file utilities.
5
- require 'fileutils'
6
- # Returns the platform we are running on.
7
- def self.platform
8
- RUBY_PLATFORM.split("-")[1]
9
- end
10
- # Runs a block of code silently, i.e. without
11
- # expressing warnings even in verbose mode.
12
- # Rename to silence_streamsings.
13
- def self.silently(&block)
14
- warn_level = $VERBOSE
15
- $VERBOSE = nil
16
- result = block.call
17
- $VERBOSE = warn_level
18
- result
19
- end
20
- def self.silence_streams(*streams)
21
- yield
22
- end
23
- # Create a temporary file which is deleted
24
- # after execution of the block.
25
- require 'tempfile'
26
- def self.create_temp_file(ext, value = nil, &block)
27
- tmp = Tempfile.new(['', ".#{ext.to_s}"], Treat.tmp)
28
- tmp.puts(value) if value
29
- block.call(tmp.path)
30
- end
31
- # A list of acronyms used in class names within
32
- # the program. These do not CamelCase; they
33
- # CAMELCASE.
34
- @@acronyms = ['XML', 'HTML', 'YAML', 'UEA', 'LDA', 'PDF', 'GOCR', 'Treat'].join('|')
35
- @@cc_cache = {}
36
- # Convert un_camel_case to CamelCase.
37
- def self.camel_case(o_phrase)
38
- phrase = o_phrase.to_s.dup
39
- return @@cc_cache[o_phrase] if @@cc_cache[o_phrase]
40
- phrase.gsub!(/#{@@acronyms.downcase}[^a-z]+/) { |a| a.upcase }
41
- phrase.gsub!(/^[a-z]|_[a-z]/) { |a| a.upcase }
42
- phrase.gsub!('_', '')
43
- @@cc_cache[o_phrase] = phrase
44
- phrase
45
- end
46
- @@ucc_cache = {}
47
- # Convert CamelCase to un_camel_case.
48
- def self.un_camel_case(o_phrase)
49
- phrase = o_phrase.to_s.dup
50
- return @@ucc_cache[o_phrase] if @@ucc_cache[o_phrase]
51
- phrase.gsub!(/#{@@acronyms}/) { |a| a.downcase.capitalize }
52
- phrase.gsub!(/[A-Z]/) { |p| '_' + p.downcase }
53
- phrase = phrase[1..-1] if phrase[0] == '_'
54
- @@ucc_cache[o_phrase] = phrase
55
- phrase
56
- end
57
- # Return the levenshtein distance between two strings,
58
- # taking into account the costs of insertion, deletion,
59
- # and substitution. Stolen from:
60
- # http://ruby-snippets.heroku.com/string/levenshtein-distance
61
- def self.levenshtein(first, other, ins=1, del=1, sub=1)
62
- return nil if first.nil? || other.nil?
63
- dm = []
64
- dm[0] = (0..first.length).collect { |i| i * ins}
65
- fill = [0] * (first.length - 1)
66
- for i in 1..other.length
67
- dm[i] = [i * del, fill.flatten]
68
- end
69
- for i in 1..other.length
70
- for j in 1..first.length
71
- dm[i][j] = [
72
- dm[i-1][j-1] + (first[i-1] == other[i-1] ? 0 : sub),
73
- dm[i][j-1] + ins,
74
- dm[i-1][j] + del
75
- ].min
76
- end
77
- end
78
- dm[other.length][first.length]
79
- end
80
- # Search the list to see if there are words
81
- # similar to name. If yes, return a string
82
- # saying "Did you mean ... ?"
83
- def self.did_you_mean?(list, name)
84
- msg = ''
85
- sugg = []
86
- list.each do |element|
87
- l = levenshtein(element,name)
88
- if l > 0 && l < 2
89
- sugg << element
90
- end
91
- end
92
- unless sugg.empty?
93
- if sugg.size == 1
94
- msg += " Perhaps you meant '#{sugg[0]}' ?"
95
- else
96
- sugg_quote = sugg[0..-2].map {|x| '\'' + x + '\''}
97
- msg += " Perhaps you meant #{sugg_quote.join(', ')}," +
98
- " or '#{sugg[-1]}' ?"
99
- end
100
- end
101
- msg
102
- end
103
- def self.caller_method(n = 3)
104
- at = caller(n).first
105
- /^(.+?):(\d+)(?::in `(.*)')?/ =~ at
106
- :"#{Regexp.last_match[3]}"
107
- end
108
- end
109
- end
110
-
111
- # Make undefining constants publicly
112
- # available on any object.
113
- Object.module_eval do
114
- def self.const_unset(const); Object.instance_eval { remove_const(const) }; puts const; end
115
- end
116
-
117
- # Make the most common utility functions available in the global scope.
118
- def create_temp_file(ext, value = nil, &block)
119
- Treat::Utilities.create_temp_file(ext, value) { |f| block.call(f) }
120
- end
121
- def silence_streams(*streams); Treat::Utilities.silence_streams(*streams) { yield }; end
122
- def silently(&block); Treat::Utilities.silently { block.call }; end
123
- def cc(w); Treat::Utilities.camel_case(w); end
124
- def ucc(w); Treat::Utilities.un_camel_case(w); end
125
- def cl(n); n.to_s.split('::')[-1]; end
126
- def did_you_mean?(l, e); Treat::Utilities.did_you_mean?(l, e); end
127
- def caller_method(n = 3); Treat::Utilities.caller_method(n); end