treat 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100)
  1. data/LICENSE +7 -8
  2. data/TODO +16 -13
  3. data/examples/keywords.rb +89 -1
  4. data/lib/treat/buildable.rb +1 -8
  5. data/lib/treat/categories.rb +3 -4
  6. data/lib/treat/category.rb +1 -1
  7. data/lib/treat/delegatable.rb +1 -1
  8. data/lib/treat/detectors/encoding/native.rb +5 -0
  9. data/lib/treat/detectors/encoding/r_chardet19.rb +2 -3
  10. data/lib/treat/detectors/language/language_detector.rb +4 -0
  11. data/lib/treat/detectors/language/what_language.rb +4 -4
  12. data/lib/treat/detectors.rb +1 -1
  13. data/lib/treat/entities/entity.rb +5 -3
  14. data/lib/treat/entities/tokens.rb +14 -5
  15. data/lib/treat/entities/zones.rb +4 -0
  16. data/lib/treat/entities.rb +7 -5
  17. data/lib/treat/extractors/keywords/topics_frequency.rb +40 -0
  18. data/lib/treat/extractors/statistics/{frequency.rb → frequency_in.rb} +5 -4
  19. data/lib/treat/extractors/statistics/frequency_of.rb +3 -5
  20. data/lib/treat/extractors/statistics/{position_in.rb → position_in_parent.rb} +4 -3
  21. data/lib/treat/extractors/statistics/tf_idf.rb +36 -0
  22. data/lib/treat/extractors/statistics/transition_matrix.rb +25 -25
  23. data/lib/treat/extractors/statistics/transition_probability.rb +8 -4
  24. data/lib/treat/extractors/time/chronic.rb +8 -0
  25. data/lib/treat/extractors/time/native.rb +6 -0
  26. data/lib/treat/extractors/time/nickel.rb +31 -23
  27. data/lib/treat/extractors/topic_words/lda.rb +21 -16
  28. data/lib/treat/extractors/topics/reuters.rb +6 -4
  29. data/lib/treat/extractors.rb +7 -7
  30. data/lib/treat/formatters/readers/abw.rb +32 -0
  31. data/lib/treat/formatters/readers/autoselect.rb +13 -11
  32. data/lib/treat/formatters/readers/doc.rb +13 -0
  33. data/lib/treat/formatters/readers/gocr.rb +2 -0
  34. data/lib/treat/formatters/readers/html.rb +21 -1
  35. data/lib/treat/formatters/readers/ocropus.rb +3 -3
  36. data/lib/treat/formatters/readers/odt.rb +41 -0
  37. data/lib/treat/formatters/readers/pdf.rb +5 -2
  38. data/lib/treat/formatters/readers/txt.rb +2 -0
  39. data/lib/treat/formatters/serializers/xml.rb +3 -2
  40. data/lib/treat/formatters/serializers/yaml.rb +2 -0
  41. data/lib/treat/formatters/unserializers/autoselect.rb +7 -1
  42. data/lib/treat/formatters/unserializers/xml.rb +6 -1
  43. data/lib/treat/formatters/unserializers/yaml.rb +5 -1
  44. data/lib/treat/formatters/visualizers/dot.rb +35 -37
  45. data/lib/treat/formatters/visualizers/html.rb +1 -0
  46. data/lib/treat/formatters/visualizers/inspect.rb +4 -0
  47. data/lib/treat/formatters/visualizers/short_value.rb +18 -3
  48. data/lib/treat/formatters/visualizers/standoff.rb +11 -6
  49. data/lib/treat/formatters/visualizers/tree.rb +5 -1
  50. data/lib/treat/formatters/visualizers/txt.rb +6 -1
  51. data/lib/treat/formatters.rb +1 -1
  52. data/lib/treat/group.rb +4 -3
  53. data/lib/treat/inflectors/cardinal_words/linguistics.rb +14 -17
  54. data/lib/treat/inflectors/conjugations/linguistics.rb +16 -3
  55. data/lib/treat/inflectors/declensions/linguistics.rb +17 -6
  56. data/lib/treat/inflectors/ordinal_words/linguistics.rb +9 -10
  57. data/lib/treat/inflectors/stem/porter.rb +6 -2
  58. data/lib/treat/inflectors/stem/porter_c.rb +4 -1
  59. data/lib/treat/inflectors/stem/uea.rb +4 -4
  60. data/lib/treat/languages/english/tags.rb +16 -0
  61. data/lib/treat/languages/english.rb +4 -1
  62. data/lib/treat/lexicalizers/category/from_tag.rb +4 -4
  63. data/lib/treat/lexicalizers/linkages/naive.rb +3 -3
  64. data/lib/treat/lexicalizers/tag/brill.rb +3 -11
  65. data/lib/treat/lexicalizers/tag/lingua.rb +4 -6
  66. data/lib/treat/lexicalizers.rb +0 -2
  67. data/lib/treat/processors/chunkers/txt.rb +4 -4
  68. data/lib/treat/processors/parsers/enju.rb +3 -17
  69. data/lib/treat/processors/parsers/stanford.rb +4 -0
  70. data/lib/treat/processors/segmenters/punkt.rb +1 -0
  71. data/lib/treat/processors/segmenters/stanford.rb +4 -0
  72. data/lib/treat/processors/segmenters/tactful.rb +4 -1
  73. data/lib/treat/processors/tokenizers/punkt.rb +1 -2
  74. data/lib/treat/processors/tokenizers/stanford.rb +4 -0
  75. data/lib/treat/processors/tokenizers/tactful.rb +1 -1
  76. data/lib/treat/processors.rb +4 -4
  77. data/lib/treat/proxies.rb +18 -11
  78. data/lib/treat/registrable.rb +12 -5
  79. data/lib/treat/sugar.rb +8 -3
  80. data/lib/treat/tree.rb +10 -3
  81. data/lib/treat.rb +55 -55
  82. data/test/tc_entity.rb +7 -7
  83. data/test/tc_extractors.rb +6 -4
  84. data/test/tc_formatters.rb +0 -4
  85. data/test/tests.rb +2 -0
  86. data/test/texts.rb +4 -4
  87. metadata +48 -56
  88. data/examples/texts/bugged_out.txt +0 -26
  89. data/examples/texts/half_cocked_basel.txt +0 -16
  90. data/examples/texts/hedge_funds.txt +0 -24
  91. data/examples/texts/hose_and_dry.txt +0 -19
  92. data/examples/texts/hungarys_troubles.txt +0 -46
  93. data/examples/texts/indias_slowdown.txt +0 -15
  94. data/examples/texts/merkozy_rides_again.txt +0 -24
  95. data/examples/texts/prada_is_not_walmart.txt +0 -9
  96. data/examples/texts/republican_nomination.txt +0 -26
  97. data/examples/texts/to_infinity_and_beyond.txt +0 -15
  98. data/lib/treat/entities/text.rb +0 -7
  99. data/lib/treat/extractors/key_sentences/topics_frequency.rb +0 -49
  100. data/lib/treat/formatters/cleaners/html.rb +0 -17
@@ -1,49 +0,0 @@
1
- module Treat
2
- module Extractors
3
- module KeySentences
4
- class TopicsFrequency
5
-
6
- def self.key_sentences(entity, options = {})
7
- options[:threshold] ||= 4
8
- @@topics = options[:topic_words]
9
- if Treat::Entities.rank(entity.type) <
10
- Treat::Entities.rank(:sentence)
11
- raise Treat::Exception, 'Cannot get the key ' +
12
- 'sentences of an entity smaller than a sentence.'
13
- else
14
- sentence_scores = {}
15
- sentences = []
16
- entity.each_sentence do |sentence|
17
- sentence_scores[sentence.id] = score_sentence(sentence)
18
- end
19
- sentence_scores.each do |sid, score|
20
- if score >= options[:threshold]
21
- s = entity.find(sid)
22
- s.set :is_key_sentence?, true
23
- sentences << s
24
- end
25
- end
26
- end
27
- sentences
28
- end
29
-
30
- def self.score_sentence(sentence)
31
- sentence.set :topic_score, 0
32
- sentence.each_word do |word|
33
- found = false
34
- @@topics.each do |i, topic_words|
35
- if topic_words.include?(word.to_s)
36
- sentence.set :topic_score,
37
- (sentence.topic_score + 1)
38
- found = true
39
- end
40
- end
41
- word.set :is_keyword?, found
42
- end
43
- sentence.topic_score
44
- end
45
-
46
- end
47
- end
48
- end
49
- end
@@ -1,17 +0,0 @@
1
- module Treat
2
- module Formatters
3
- module Cleaners
4
- class HTML
5
- silence_warnings { require 'hpricot' }
6
- def self.clean(document, options = {})
7
- document.each_text do |text|
8
- text.set :html_value, text.value
9
- v = Hpricot(text.value).inner_text
10
- text.value = v
11
- end
12
- document
13
- end
14
- end
15
- end
16
- end
17
- end