treat 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (100) hide show
  1. data/LICENSE +7 -8
  2. data/TODO +16 -13
  3. data/examples/keywords.rb +89 -1
  4. data/lib/treat/buildable.rb +1 -8
  5. data/lib/treat/categories.rb +3 -4
  6. data/lib/treat/category.rb +1 -1
  7. data/lib/treat/delegatable.rb +1 -1
  8. data/lib/treat/detectors/encoding/native.rb +5 -0
  9. data/lib/treat/detectors/encoding/r_chardet19.rb +2 -3
  10. data/lib/treat/detectors/language/language_detector.rb +4 -0
  11. data/lib/treat/detectors/language/what_language.rb +4 -4
  12. data/lib/treat/detectors.rb +1 -1
  13. data/lib/treat/entities/entity.rb +5 -3
  14. data/lib/treat/entities/tokens.rb +14 -5
  15. data/lib/treat/entities/zones.rb +4 -0
  16. data/lib/treat/entities.rb +7 -5
  17. data/lib/treat/extractors/keywords/topics_frequency.rb +40 -0
  18. data/lib/treat/extractors/statistics/{frequency.rb → frequency_in.rb} +5 -4
  19. data/lib/treat/extractors/statistics/frequency_of.rb +3 -5
  20. data/lib/treat/extractors/statistics/{position_in.rb → position_in_parent.rb} +4 -3
  21. data/lib/treat/extractors/statistics/tf_idf.rb +36 -0
  22. data/lib/treat/extractors/statistics/transition_matrix.rb +25 -25
  23. data/lib/treat/extractors/statistics/transition_probability.rb +8 -4
  24. data/lib/treat/extractors/time/chronic.rb +8 -0
  25. data/lib/treat/extractors/time/native.rb +6 -0
  26. data/lib/treat/extractors/time/nickel.rb +31 -23
  27. data/lib/treat/extractors/topic_words/lda.rb +21 -16
  28. data/lib/treat/extractors/topics/reuters.rb +6 -4
  29. data/lib/treat/extractors.rb +7 -7
  30. data/lib/treat/formatters/readers/abw.rb +32 -0
  31. data/lib/treat/formatters/readers/autoselect.rb +13 -11
  32. data/lib/treat/formatters/readers/doc.rb +13 -0
  33. data/lib/treat/formatters/readers/gocr.rb +2 -0
  34. data/lib/treat/formatters/readers/html.rb +21 -1
  35. data/lib/treat/formatters/readers/ocropus.rb +3 -3
  36. data/lib/treat/formatters/readers/odt.rb +41 -0
  37. data/lib/treat/formatters/readers/pdf.rb +5 -2
  38. data/lib/treat/formatters/readers/txt.rb +2 -0
  39. data/lib/treat/formatters/serializers/xml.rb +3 -2
  40. data/lib/treat/formatters/serializers/yaml.rb +2 -0
  41. data/lib/treat/formatters/unserializers/autoselect.rb +7 -1
  42. data/lib/treat/formatters/unserializers/xml.rb +6 -1
  43. data/lib/treat/formatters/unserializers/yaml.rb +5 -1
  44. data/lib/treat/formatters/visualizers/dot.rb +35 -37
  45. data/lib/treat/formatters/visualizers/html.rb +1 -0
  46. data/lib/treat/formatters/visualizers/inspect.rb +4 -0
  47. data/lib/treat/formatters/visualizers/short_value.rb +18 -3
  48. data/lib/treat/formatters/visualizers/standoff.rb +11 -6
  49. data/lib/treat/formatters/visualizers/tree.rb +5 -1
  50. data/lib/treat/formatters/visualizers/txt.rb +6 -1
  51. data/lib/treat/formatters.rb +1 -1
  52. data/lib/treat/group.rb +4 -3
  53. data/lib/treat/inflectors/cardinal_words/linguistics.rb +14 -17
  54. data/lib/treat/inflectors/conjugations/linguistics.rb +16 -3
  55. data/lib/treat/inflectors/declensions/linguistics.rb +17 -6
  56. data/lib/treat/inflectors/ordinal_words/linguistics.rb +9 -10
  57. data/lib/treat/inflectors/stem/porter.rb +6 -2
  58. data/lib/treat/inflectors/stem/porter_c.rb +4 -1
  59. data/lib/treat/inflectors/stem/uea.rb +4 -4
  60. data/lib/treat/languages/english/tags.rb +16 -0
  61. data/lib/treat/languages/english.rb +4 -1
  62. data/lib/treat/lexicalizers/category/from_tag.rb +4 -4
  63. data/lib/treat/lexicalizers/linkages/naive.rb +3 -3
  64. data/lib/treat/lexicalizers/tag/brill.rb +3 -11
  65. data/lib/treat/lexicalizers/tag/lingua.rb +4 -6
  66. data/lib/treat/lexicalizers.rb +0 -2
  67. data/lib/treat/processors/chunkers/txt.rb +4 -4
  68. data/lib/treat/processors/parsers/enju.rb +3 -17
  69. data/lib/treat/processors/parsers/stanford.rb +4 -0
  70. data/lib/treat/processors/segmenters/punkt.rb +1 -0
  71. data/lib/treat/processors/segmenters/stanford.rb +4 -0
  72. data/lib/treat/processors/segmenters/tactful.rb +4 -1
  73. data/lib/treat/processors/tokenizers/punkt.rb +1 -2
  74. data/lib/treat/processors/tokenizers/stanford.rb +4 -0
  75. data/lib/treat/processors/tokenizers/tactful.rb +1 -1
  76. data/lib/treat/processors.rb +4 -4
  77. data/lib/treat/proxies.rb +18 -11
  78. data/lib/treat/registrable.rb +12 -5
  79. data/lib/treat/sugar.rb +8 -3
  80. data/lib/treat/tree.rb +10 -3
  81. data/lib/treat.rb +55 -55
  82. data/test/tc_entity.rb +7 -7
  83. data/test/tc_extractors.rb +6 -4
  84. data/test/tc_formatters.rb +0 -4
  85. data/test/tests.rb +2 -0
  86. data/test/texts.rb +4 -4
  87. metadata +48 -56
  88. data/examples/texts/bugged_out.txt +0 -26
  89. data/examples/texts/half_cocked_basel.txt +0 -16
  90. data/examples/texts/hedge_funds.txt +0 -24
  91. data/examples/texts/hose_and_dry.txt +0 -19
  92. data/examples/texts/hungarys_troubles.txt +0 -46
  93. data/examples/texts/indias_slowdown.txt +0 -15
  94. data/examples/texts/merkozy_rides_again.txt +0 -24
  95. data/examples/texts/prada_is_not_walmart.txt +0 -9
  96. data/examples/texts/republican_nomination.txt +0 -26
  97. data/examples/texts/to_infinity_and_beyond.txt +0 -15
  98. data/lib/treat/entities/text.rb +0 -7
  99. data/lib/treat/extractors/key_sentences/topics_frequency.rb +0 -49
  100. data/lib/treat/formatters/cleaners/html.rb +0 -17
@@ -1,49 +0,0 @@
1
- module Treat
2
- module Extractors
3
- module KeySentences
4
- class TopicsFrequency
5
-
6
- def self.key_sentences(entity, options = {})
7
- options[:threshold] ||= 4
8
- @@topics = options[:topic_words]
9
- if Treat::Entities.rank(entity.type) <
10
- Treat::Entities.rank(:sentence)
11
- raise Treat::Exception, 'Cannot get the key ' +
12
- 'sentences of an entity smaller than a sentence.'
13
- else
14
- sentence_scores = {}
15
- sentences = []
16
- entity.each_sentence do |sentence|
17
- sentence_scores[sentence.id] = score_sentence(sentence)
18
- end
19
- sentence_scores.each do |sid, score|
20
- if score >= options[:threshold]
21
- s = entity.find(sid)
22
- s.set :is_key_sentence?, true
23
- sentences << s
24
- end
25
- end
26
- end
27
- sentences
28
- end
29
-
30
- def self.score_sentence(sentence)
31
- sentence.set :topic_score, 0
32
- sentence.each_word do |word|
33
- found = false
34
- @@topics.each do |i, topic_words|
35
- if topic_words.include?(word.to_s)
36
- sentence.set :topic_score,
37
- (sentence.topic_score + 1)
38
- found = true
39
- end
40
- end
41
- word.set :is_keyword?, found
42
- end
43
- sentence.topic_score
44
- end
45
-
46
- end
47
- end
48
- end
49
- end
@@ -1,17 +0,0 @@
1
- module Treat
2
- module Formatters
3
- module Cleaners
4
- class HTML
5
- silence_warnings { require 'hpricot' }
6
- def self.clean(document, options = {})
7
- document.each_text do |text|
8
- text.set :html_value, text.value
9
- v = Hpricot(text.value).inner_text
10
- text.value = v
11
- end
12
- document
13
- end
14
- end
15
- end
16
- end
17
- end