treat 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (81)
  1. data/INSTALL +1 -0
  2. data/README +3 -0
  3. data/TODO +14 -26
  4. data/bin/INFO +1 -1
  5. data/lib/treat/buildable.rb +10 -11
  6. data/lib/treat/categories.rb +8 -6
  7. data/lib/treat/category.rb +7 -2
  8. data/lib/treat/delegatable.rb +64 -56
  9. data/lib/treat/detectors/encoding/r_chardet19.rb +1 -1
  10. data/lib/treat/detectors/language/language_detector.rb +2 -1
  11. data/lib/treat/detectors/language/what_language.rb +2 -2
  12. data/lib/treat/detectors.rb +3 -0
  13. data/lib/treat/entities/entity.rb +1 -1
  14. data/lib/treat/entities.rb +9 -10
  15. data/lib/treat/exception.rb +3 -1
  16. data/lib/treat/extractors/named_entity/abner.rb +1 -1
  17. data/lib/treat/extractors/named_entity/stanford.rb +2 -2
  18. data/lib/treat/extractors/time/chronic.rb +2 -2
  19. data/lib/treat/extractors/time/nickel.rb +2 -2
  20. data/lib/treat/extractors/topic_words/lda.rb +2 -2
  21. data/lib/treat/extractors.rb +12 -9
  22. data/lib/treat/feature.rb +6 -1
  23. data/lib/treat/formatters/cleaners/html.rb +1 -1
  24. data/lib/treat/formatters.rb +8 -8
  25. data/lib/treat/group.rb +11 -10
  26. data/lib/treat/inflectors/cardinal_words/linguistics.rb +3 -3
  27. data/lib/treat/inflectors/{conjugators → conjugations}/linguistics.rb +6 -6
  28. data/lib/treat/inflectors/{declensors → declensions}/en.rb +2 -2
  29. data/lib/treat/inflectors/{declensors → declensions}/linguistics.rb +5 -5
  30. data/lib/treat/inflectors/ordinal_words/linguistics.rb +4 -4
  31. data/lib/treat/inflectors/{stemmers → stem}/porter.rb +1 -1
  32. data/lib/treat/inflectors/{stemmers → stem}/porter_c.rb +3 -3
  33. data/lib/treat/inflectors/{stemmers → stem}/uea.rb +3 -3
  34. data/lib/treat/inflectors.rb +8 -21
  35. data/lib/treat/kernel.rb +120 -0
  36. data/lib/treat/languages/arabic.rb +14 -0
  37. data/lib/treat/languages/categories.rb +5 -0
  38. data/lib/treat/languages/chinese.rb +12 -0
  39. data/lib/treat/languages/english/categories.rb +23 -0
  40. data/lib/treat/{resources → languages/english}/tags.rb +127 -184
  41. data/lib/treat/languages/english.rb +33 -0
  42. data/lib/treat/languages/french.rb +17 -0
  43. data/lib/treat/languages/german.rb +17 -0
  44. data/lib/treat/languages/italian.rb +14 -0
  45. data/lib/treat/{resources/languages.txt → languages/list.txt} +0 -0
  46. data/lib/treat/languages/xinhua.rb +12 -0
  47. data/lib/treat/languages.rb +91 -0
  48. data/lib/treat/lexicalizers/category/from_tag.rb +20 -8
  49. data/lib/treat/lexicalizers/synsets/rita_wn.rb +1 -1
  50. data/lib/treat/lexicalizers/tag/brill.rb +2 -1
  51. data/lib/treat/lexicalizers/tag/lingua.rb +2 -1
  52. data/lib/treat/lexicalizers/tag/stanford.rb +16 -15
  53. data/lib/treat/lexicalizers.rb +1 -1
  54. data/lib/treat/object.rb +6 -0
  55. data/lib/treat/processors/parsers/enju.rb +3 -2
  56. data/lib/treat/processors/parsers/stanford.rb +15 -12
  57. data/lib/treat/processors/segmenters/punkt.rb +1 -1
  58. data/lib/treat/processors/segmenters/stanford.rb +7 -5
  59. data/lib/treat/processors/segmenters/tactful.rb +1 -1
  60. data/lib/treat/processors/tokenizers/multilingual.rb +2 -2
  61. data/lib/treat/processors/tokenizers/stanford.rb +7 -5
  62. data/lib/treat/visitable.rb +2 -1
  63. data/lib/treat.rb +105 -54
  64. data/test/tc_entity.rb +5 -0
  65. data/test/tc_resources.rb +5 -5
  66. data/test/tc_treat.rb +1 -2
  67. data/test/tests.rb +2 -1
  68. metadata +63 -64
  69. data/lib/treat/formatters/serializers/yaml/helper.rb +0 -96
  70. data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +0 -213
  71. data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +0 -68
  72. data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +0 -6
  73. data/lib/treat/inflectors/lemmatizers/e_lemma.rb +0 -12
  74. data/lib/treat/resources/categories.rb +0 -18
  75. data/lib/treat/resources/delegates.rb +0 -96
  76. data/lib/treat/resources/dependencies.rb +0 -0
  77. data/lib/treat/resources/edges.rb +0 -8
  78. data/lib/treat/resources/formats.rb +0 -23
  79. data/lib/treat/resources/languages.rb +0 -86
  80. data/lib/treat/resources.rb +0 -10
  81. data/lib/treat/utilities.rb +0 -127
@@ -1,68 +0,0 @@
1
- #include "wn.h"
2
- #include "wnconsts.h"
3
- #include "ruby.h"
4
-
5
- /*
6
-
7
- Copyright (C) 2004 UTIYAMA Masao <mutiyama@crl.go.jp>
8
-
9
- This program is free software; you can redistribute it and/or modify
10
- it under the terms of the GNU General Public License as published by
11
- the Free Software Foundation; either version 2 of the License, or
12
- (at your option) any later version.
13
-
14
- This program is distributed in the hope that it will be useful,
15
- but WITHOUT ANY WARRANTY; without even the implied warranty of
16
- MERCHANTABITreatY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
- GNU General Public License for more details.
18
-
19
- You should have received a copy of the GNU General Public License
20
- along with this program; if not, write to the Free Software
21
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22
- */
23
-
24
- static VALUE
25
- parse(VALUE klass, VALUE rb_word, VALUE rb_pos)
26
- {
27
- char *word;
28
- char *POS = STR2CSTR(rb_pos);
29
- char *lemma;
30
- int pos;
31
- VALUE retval = rb_ary_new();
32
-
33
- word = malloc(strlen(STR2CSTR(rb_word))+1);
34
- if(!word){rb_raise(rb_eStandardError, "malloc failed.\n");}
35
- strcpy(word, STR2CSTR(rb_word));
36
-
37
- if(strcmp(POS,"noun")==0){pos = NOUN;}
38
- else if(strcmp(POS,"verb")==0){pos = VERB;}
39
- else if(strcmp(POS,"adj")==0){pos = ADJ;}
40
- else if(strcmp(POS,"adv")==0){pos = ADV;}
41
- else{
42
- rb_raise(rb_eStandardError, "%s should be (noun|verb|adj|adv)\n", POS);
43
- }
44
- if(is_defined(word, pos)){
45
- /*printf("* %s found as is.\n", word);*/
46
- rb_ary_push(retval, rb_str_new2(word));
47
- }
48
- if((lemma=morphstr(word, pos))!=NULL){
49
- do {
50
- if(is_defined(lemma, pos)){
51
- /*printf("* %s => %s found.\n", word, lemma);*/
52
- rb_ary_push(retval, rb_str_new2(lemma));
53
- }
54
- } while((lemma=morphstr(NULL, pos))!=NULL);
55
- }
56
- free(word);
57
- return retval;
58
- }
59
-
60
- void
61
- Init_elemma()
62
- {
63
- VALUE mod = rb_define_module("ELemma");
64
- rb_define_module_function(mod, "parse", parse, 2);
65
- if(wninit()){
66
- rb_raise(rb_eStandardError, "Cannot open WordNet database\n");
67
- }
68
- }
@@ -1,6 +0,0 @@
1
- require 'mkmf'
2
-
3
- $CFLAGS = "-Wall -I/usr/local/WordNet-2.1/include/"
4
- $LOCAL_LIBS = "-L/usr/local/WordNet-2.1/lib -lwn"
5
-
6
- create_makefile("elemma")
@@ -1,12 +0,0 @@
1
- module Treat
2
- module Inflectors
3
- module Lemmatizers
4
- class ELemma
5
- silently { require 'treat/inflectors/lemmatizers/elemma/elemma'}
6
- def self.lemma(entity, options = nil)
7
- ::ELemma::parse(word, entity.tag)
8
- end
9
- end
10
- end
11
- end
12
- end
@@ -1,18 +0,0 @@
1
- module Treat
2
- module Resources
3
- class Categories
4
- List = [
5
- :adjective, :adverb, :noun, :verb, :interjection,
6
- :clitic, :coverb, :conjunction, :determiner, :particle,
7
- :preposition, :pronoun, :number, :symbol, :punctuation,
8
- :complementizer
9
- ]
10
- wttc = {}
11
- Treat::Resources::Tags::AlignedWordTags.each_slice(2) do |desc, tags|
12
- desc = desc.gsub(',', ' ,').split(' ')[0].downcase
13
- tags.each { |tag| wttc[tag] = desc.intern }
14
- end
15
- WordTagToCategory = wttc
16
- end
17
- end
18
- end
@@ -1,96 +0,0 @@
1
- module Treat
2
- module Resources
3
- module Delegates
4
- class English
5
- Extractors = {
6
- time: [:chronic],
7
- topics: [:reuters],
8
- topic_words: [:lda],
9
- key_sentences: [:topics_frequency]
10
- }
11
- Processors = {
12
- chunkers: [:txt],
13
- parsers: [:enju, :stanford],
14
- segmenters: [:tactful, :punkt, :stanford],
15
- tokenizers: [:multilingual, :macintyre, :perl, :punkt, :tactful, :stanford]
16
- }
17
- Lexicalizers = {
18
- category: [:from_tag],
19
- linkages: [:naive],
20
- synsets: [:wordnet, :rita_wn],
21
- tag: [:brill, :lingua, :stanford]
22
- }
23
- Inflectors = {
24
- conjugators: [:linguistics],
25
- declensors: [:linguistics, :english],
26
- lemmatizers: [:e_lemma],
27
- stemmers: [:porter_c, :porter, :uea],
28
- ordinal_words: [:linguistics],
29
- cardinal_words: [:linguistics]
30
- }
31
- end
32
- class German
33
- Extractors = {}
34
- Inflectors = {}
35
- Lexicalizers = {
36
- tag: [:stanford]
37
- }
38
- Processors = {
39
- chunkers: [:txt],
40
- parsers: [:stanford],
41
- segmenters: [:tactful, :punkt, :stanford],
42
- tokenizers: [:multilingual, :macintyre, :perl, :punkt, :tactful, :stanford]
43
- }
44
- end
45
- class French
46
- Extractors = {}
47
- Inflectors = {}
48
- Lexicalizers = {
49
- tag: [:stanford]
50
- }
51
- Processors = {
52
- chunkers: [:txt],
53
- parsers: [:stanford],
54
- segmenters: [:tactful, :punkt, :stanford],
55
- tokenizers: [:multilingual, :macintyre, :perl, :punkt, :tactful, :stanford]
56
- }
57
- end
58
- class Italian
59
- Extractors = {}
60
- Inflectors = {}
61
- Lexicalizers = {}
62
- Processors = {
63
- chunkers: [:txt],
64
- segmenters: [:tactful, :punkt, :stanford],
65
- tokenizers: [:multilingual, :macintyre, :perl, :punkt, :tactful, :stanford]
66
- }
67
- end
68
- class Arabic
69
- Extractors = {}
70
- Inflectors = {}
71
- Lexicalizers = {
72
- tag: [:stanford]
73
- }
74
- Processors = {
75
- parsers: [:stanford]
76
- }
77
- end
78
- class Chinese
79
- Extractors = {}
80
- Inflectors = {}
81
- Lexicalizers = {
82
- tag: [:stanford]
83
- }
84
- Processors = {}
85
- end
86
- class Xinhua
87
- Extractors = {}
88
- Inflectors = {}
89
- Lexicalizers = {}
90
- Processors = {
91
- parsers: [:stanford]
92
- }
93
- end
94
- end
95
- end
96
- end
File without changes
@@ -1,8 +0,0 @@
1
- # http://nlp.stanford.edu/software/dependencies_manual.pdf
2
-
3
- =begin
4
- ENJU
5
-
6
- pred: noun_arg0, noun_arg1, noun_arg2, noun_arg12, it_arg1, there_arg0, quote_arg2, quote_arg12, quote_arg23, quote_arg123, poss_arg2, poss_arg12, aux_arg12, aux_mod_arg12, verb_arg1, verb_arg12, verb_arg123, verb_arg1234, verb_mod_arg1, verb_mod_arg12, verb_mod_arg123, verb_mod_arg1234, adj_arg1, adj_arg12, adj_mod_arg1, adj_mod_arg12, conj_arg1, conj_arg12, conj_arg123, coord_arg12, det_arg1, prep_arg12, prep_arg123, prep_mod_arg12, prep_mod_arg123, lgs_arg2, dtv_arg2, punct_arg1, app_arg12, lparen_arg123, rparen_arg0, comp_arg1, comp_arg12, comp_mod_arg1, relative_arg1, relative_arg12
7
-
8
- =end
@@ -1,23 +0,0 @@
1
- module Treat
2
- module Resources
3
- module Format
4
-
5
- class XML
6
- require 'nokogiri'
7
- def self.validate(document_path, schema_path, root_element)
8
- schema = Nokogiri::XML::Schema(File.read(schema_path))
9
- document = Nokogiri::XML(File.read(document_path))
10
- schema.validate(document.xpath("//#{root_element}").to_s)
11
- end
12
- validate('input.xml', 'schema.xdf', 'container').each do |error|
13
- puts error.message
14
- end
15
- end
16
-
17
- class HTML < XML
18
-
19
- end
20
-
21
- end
22
- end
23
- end
@@ -1,86 +0,0 @@
1
- module Treat
2
- module Resources
3
- # Dictionnary of ISO-639-1, ISO-639-2 language codes,
4
- # as well as their full text description in both
5
- # English and French.
6
- module Languages
7
- ISO639_1 = 1
8
- ISO639_2 = 2
9
- # Describe a language code (ISO-639-1 or ISO-639-2)
10
- # or its full text description in full French or English.
11
- def self.describe(lang, desc_lang = :en)
12
- raise "Must provide a non-nil language identifier to describe." if lang.nil?
13
- lang = find(lang).to_s
14
- if [:en, :eng, :english, :anglais].include?(desc_lang)
15
- l = @@english_full.key(lang)
16
- elsif [:fr, :fra, :french, :french].include?(desc_lang)
17
- l = @@french_full.key(lang)
18
- else
19
- raise Treat::Exception,
20
- "Unknown language to describe: #{desc_lang}."
21
- end
22
- not_found(lang) if l.nil?
23
- l.intern
24
- end
25
- # Raise an error message when a language code
26
- # or description is not found and suggest
27
- # possible misspellings.
28
- def self.not_found(lang)
29
- msg = "Language '#{lang}' does not exist."
30
- all = @@iso639_2.keys + @@iso639_1.keys +
31
- @@english_full.keys + @@french_full.keys
32
- msg += did_you_mean?(all, lang)
33
- raise Treat::Exception, msg
34
- end
35
- # Find a language by ISO-639-1 or ISO-639-2 code
36
- # or full name (in English or French) and return
37
- # the ISO-639-1 or ISO-639-2 language code as a
38
- # lowercase identifier.
39
- def self.find(lang, rc = ISO639_2)
40
- raise "Must provide a non-nil language identifier to describe." if lang.nil?
41
- get_languages
42
- lang = lang.to_s.downcase
43
- if @@iso639_1.has_key?(lang)
44
- return :"#{lang}" if rc == ISO639_1
45
- return :"#{@@iso639_1[lang]}" if rc == ISO639_2
46
- elsif @@iso639_2.has_key?(lang)
47
- return :"#{lang}" if rc == ISO639_2
48
- return :"#{@@iso639_2[lang]}" if rc == ISO639_1
49
- elsif @@english_full.has_key?(lang)
50
- return :"#{@@english_full[lang]}" if rc == ISO639_2
51
- return :"#{@@iso639_2[@@english_full[lang]]}" if rc == ISO639_1
52
- elsif @@french_full.has_key?(lang)
53
- return :"#{@@french_full[lang]}" if rc == ISO639_2
54
- return :"#{@@iso639_1[@@french_full[lang]]}" if rc == ISO639_2
55
- else
56
- not_found(lang)
57
- end
58
- end
59
- @@loaded = false
60
- # Get the languages from the dictionary.
61
- def self.get_languages
62
- return if @@loaded
63
- @@iso639_1 = {}; @@iso639_2 = {};
64
- @@english_full = {}; @@french_full = {}
65
- languages = IO.readlines(Treat.lib + '/treat/resources/languages.txt')
66
- languages.each do |language|
67
- iso639_2, iso639_1, english_desc, french_desc =
68
- language.split(',')
69
- @@iso639_1[iso639_1] = iso639_2
70
- @@iso639_2[iso639_2] = iso639_1
71
- unless english_desc.nil?
72
- english_desc.strip.downcase.split('|').each do |l|
73
- @@english_full[l.downcase.strip] = iso639_2
74
- end
75
- end
76
- unless french_desc.nil?
77
- french_desc.strip.downcase.split('|').each do |l|
78
- @@french_full[l.downcase.strip] = iso639_2
79
- end
80
- end
81
- end
82
- @@loaded = true
83
- end
84
- end
85
- end
86
- end
@@ -1,10 +0,0 @@
1
- module Treat
2
- module Resources
3
- require 'treat/resources/delegates'
4
- require 'treat/resources/dependencies'
5
- require 'treat/resources/edges'
6
- require 'treat/resources/languages'
7
- require 'treat/resources/tags'
8
- require 'treat/resources/categories'
9
- end
10
- end
@@ -1,127 +0,0 @@
1
- module Treat
2
- # Provides utility functions used across the library.
3
- module Utilities
4
- # Require file utilities.
5
- require 'fileutils'
6
- # Returns the platform we are running on.
7
- def self.platform
8
- RUBY_PLATFORM.split("-")[1]
9
- end
10
- # Runs a block of code silently, i.e. without
11
- # expressing warnings even in verbose mode.
12
- # Rename to silence_streamsings.
13
- def self.silently(&block)
14
- warn_level = $VERBOSE
15
- $VERBOSE = nil
16
- result = block.call
17
- $VERBOSE = warn_level
18
- result
19
- end
20
- def self.silence_streams(*streams)
21
- yield
22
- end
23
- # Create a temporary file which is deleted
24
- # after execution of the block.
25
- require 'tempfile'
26
- def self.create_temp_file(ext, value = nil, &block)
27
- tmp = Tempfile.new(['', ".#{ext.to_s}"], Treat.tmp)
28
- tmp.puts(value) if value
29
- block.call(tmp.path)
30
- end
31
- # A list of acronyms used in class names within
32
- # the program. These do not CamelCase; they
33
- # CAMELCASE.
34
- @@acronyms = ['XML', 'HTML', 'YAML', 'UEA', 'LDA', 'PDF', 'GOCR', 'Treat'].join('|')
35
- @@cc_cache = {}
36
- # Convert un_camel_case to CamelCase.
37
- def self.camel_case(o_phrase)
38
- phrase = o_phrase.to_s.dup
39
- return @@cc_cache[o_phrase] if @@cc_cache[o_phrase]
40
- phrase.gsub!(/#{@@acronyms.downcase}[^a-z]+/) { |a| a.upcase }
41
- phrase.gsub!(/^[a-z]|_[a-z]/) { |a| a.upcase }
42
- phrase.gsub!('_', '')
43
- @@cc_cache[o_phrase] = phrase
44
- phrase
45
- end
46
- @@ucc_cache = {}
47
- # Convert CamelCase to un_camel_case.
48
- def self.un_camel_case(o_phrase)
49
- phrase = o_phrase.to_s.dup
50
- return @@ucc_cache[o_phrase] if @@ucc_cache[o_phrase]
51
- phrase.gsub!(/#{@@acronyms}/) { |a| a.downcase.capitalize }
52
- phrase.gsub!(/[A-Z]/) { |p| '_' + p.downcase }
53
- phrase = phrase[1..-1] if phrase[0] == '_'
54
- @@ucc_cache[o_phrase] = phrase
55
- phrase
56
- end
57
- # Return the levensthein distance between two stringsm
58
- # taking into account the costs of insertion, deletion,
59
- # and substitution. Stolen from:
60
- # http://ruby-snippets.heroku.com/string/levenshtein-distance
61
- def self.levenshtein(first, other, ins=1, del=1, sub=1)
62
- return nil if first.nil? || other.nil?
63
- dm = []
64
- dm[0] = (0..first.length).collect { |i| i * ins}
65
- fill = [0] * (first.length - 1)
66
- for i in 1..other.length
67
- dm[i] = [i * del, fill.flatten]
68
- end
69
- for i in 1..other.length
70
- for j in 1..first.length
71
- dm[i][j] = [
72
- dm[i-1][j-1] + (first[i-1] == other[i-1] ? 0 : sub),
73
- dm[i][j-1] + ins,
74
- dm[i-1][j] + del
75
- ].min
76
- end
77
- end
78
- dm[other.length][first.length]
79
- end
80
- # Search the list to see if there are words
81
- # similar to name. If yes, return a string
82
- # saying "Did you mean ... ?"
83
- def self.did_you_mean?(list, name)
84
- msg = ''
85
- sugg = []
86
- list.each do |element|
87
- l = levenshtein(element,name)
88
- if l > 0 && l < 2
89
- sugg << element
90
- end
91
- end
92
- unless sugg.empty?
93
- if sugg.size == 1
94
- msg += " Perhaps you meant '#{sugg[0]}' ?"
95
- else
96
- sugg_quote = sugg[0..-2].map {|x| '\'' + x + '\''}
97
- msg += " Perhaps you meant #{sugg_quote.join(', ')}," +
98
- " or '#{sugg[-1]}' ?"
99
- end
100
- end
101
- msg
102
- end
103
- def self.caller_method(n = 3)
104
- at = caller(n).first
105
- /^(.+?):(\d+)(?::in `(.*)')?/ =~ at
106
- :"#{Regexp.last_match[3]}"
107
- end
108
- end
109
- end
110
-
111
- # Make undefining constants publicly
112
- # available on any object.
113
- Object.module_eval do
114
- def self.const_unset(const); Object.instance_eval { remove_const(const) }; puts const; end
115
- end
116
-
117
- # Make the most common utility functions available in the global scope.
118
- def create_temp_file(ext, value = nil, &block)
119
- Treat::Utilities.create_temp_file(ext, value) { |f| block.call(f) }
120
- end
121
- def silence_streams(*streams); Treat::Utilities.silence_streams(*streams) { yield }; end
122
- def silently(&block); Treat::Utilities.silently { block.call }; end
123
- def cc(w); Treat::Utilities.camel_case(w); end
124
- def ucc(w); Treat::Utilities.un_camel_case(w); end
125
- def cl(n); n.to_s.split('::')[-1]; end
126
- def did_you_mean?(l, e); Treat::Utilities.did_you_mean?(l, e); end
127
- def caller_method(n = 3); Treat::Utilities.caller_method(n); end