treat 0.2.5 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242)
  1. data/LICENSE +3 -3
  2. data/README.md +33 -0
  3. data/files/INFO +1 -0
  4. data/lib/treat.rb +40 -105
  5. data/lib/treat/ai.rb +12 -0
  6. data/lib/treat/ai/classifiers/id3.rb +27 -0
  7. data/lib/treat/categories.rb +82 -35
  8. data/lib/treat/categorizable.rb +44 -0
  9. data/lib/treat/classification.rb +61 -0
  10. data/lib/treat/configurable.rb +115 -0
  11. data/lib/treat/data_set.rb +42 -0
  12. data/lib/treat/dependencies.rb +24 -0
  13. data/lib/treat/downloader.rb +87 -0
  14. data/lib/treat/entities.rb +68 -66
  15. data/lib/treat/entities/abilities.rb +10 -0
  16. data/lib/treat/entities/abilities/buildable.rb +327 -0
  17. data/lib/treat/entities/abilities/checkable.rb +31 -0
  18. data/lib/treat/entities/abilities/copyable.rb +45 -0
  19. data/lib/treat/entities/abilities/countable.rb +51 -0
  20. data/lib/treat/entities/abilities/debuggable.rb +83 -0
  21. data/lib/treat/entities/abilities/delegatable.rb +123 -0
  22. data/lib/treat/entities/abilities/doable.rb +62 -0
  23. data/lib/treat/entities/abilities/exportable.rb +11 -0
  24. data/lib/treat/entities/abilities/iterable.rb +115 -0
  25. data/lib/treat/entities/abilities/magical.rb +83 -0
  26. data/lib/treat/entities/abilities/registrable.rb +74 -0
  27. data/lib/treat/entities/abilities/stringable.rb +91 -0
  28. data/lib/treat/entities/entities.rb +104 -0
  29. data/lib/treat/entities/entity.rb +122 -245
  30. data/lib/treat/exception.rb +4 -4
  31. data/lib/treat/extractors.rb +77 -80
  32. data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
  33. data/lib/treat/extractors/language/what_language.rb +50 -45
  34. data/lib/treat/extractors/name_tag/stanford.rb +55 -0
  35. data/lib/treat/extractors/tf_idf/native.rb +87 -0
  36. data/lib/treat/extractors/time/chronic.rb +55 -0
  37. data/lib/treat/extractors/time/nickel.rb +86 -62
  38. data/lib/treat/extractors/time/ruby.rb +53 -0
  39. data/lib/treat/extractors/topic_words/lda.rb +67 -58
  40. data/lib/treat/extractors/topics/reuters.rb +100 -87
  41. data/lib/treat/formatters.rb +39 -35
  42. data/lib/treat/formatters/readers/abw.rb +49 -29
  43. data/lib/treat/formatters/readers/autoselect.rb +37 -33
  44. data/lib/treat/formatters/readers/doc.rb +19 -13
  45. data/lib/treat/formatters/readers/html.rb +52 -30
  46. data/lib/treat/formatters/readers/image.rb +41 -40
  47. data/lib/treat/formatters/readers/odt.rb +59 -45
  48. data/lib/treat/formatters/readers/pdf.rb +28 -25
  49. data/lib/treat/formatters/readers/txt.rb +12 -15
  50. data/lib/treat/formatters/readers/xml.rb +73 -36
  51. data/lib/treat/formatters/serializers/xml.rb +80 -79
  52. data/lib/treat/formatters/serializers/yaml.rb +19 -18
  53. data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
  54. data/lib/treat/formatters/unserializers/xml.rb +94 -99
  55. data/lib/treat/formatters/unserializers/yaml.rb +20 -19
  56. data/lib/treat/formatters/visualizers/dot.rb +132 -132
  57. data/lib/treat/formatters/visualizers/standoff.rb +52 -44
  58. data/lib/treat/formatters/visualizers/tree.rb +26 -29
  59. data/lib/treat/groupable.rb +153 -0
  60. data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
  61. data/lib/treat/inflectors.rb +50 -45
  62. data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
  63. data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
  64. data/lib/treat/inflectors/declensors/active_support.rb +31 -0
  65. data/lib/treat/inflectors/declensors/english.rb +38 -0
  66. data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
  67. data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
  68. data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
  69. data/lib/treat/inflectors/stemmers/porter.rb +160 -0
  70. data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
  71. data/lib/treat/inflectors/stemmers/uea.rb +28 -0
  72. data/lib/treat/installer.rb +308 -0
  73. data/lib/treat/kernel.rb +105 -27
  74. data/lib/treat/languages.rb +122 -88
  75. data/lib/treat/languages/arabic.rb +15 -15
  76. data/lib/treat/languages/chinese.rb +15 -15
  77. data/lib/treat/languages/dutch.rb +15 -15
  78. data/lib/treat/languages/english.rb +61 -62
  79. data/lib/treat/languages/french.rb +19 -19
  80. data/lib/treat/languages/german.rb +20 -20
  81. data/lib/treat/languages/greek.rb +15 -15
  82. data/lib/treat/languages/italian.rb +16 -16
  83. data/lib/treat/languages/polish.rb +15 -15
  84. data/lib/treat/languages/portuguese.rb +15 -15
  85. data/lib/treat/languages/russian.rb +15 -15
  86. data/lib/treat/languages/spanish.rb +16 -16
  87. data/lib/treat/languages/swedish.rb +16 -16
  88. data/lib/treat/lexicalizers.rb +34 -55
  89. data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
  90. data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
  91. data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
  92. data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
  93. data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
  94. data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
  95. data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
  96. data/lib/treat/linguistics.rb +9 -0
  97. data/lib/treat/linguistics/categories.rb +11 -0
  98. data/lib/treat/linguistics/tags.rb +422 -0
  99. data/lib/treat/loaders/linguistics.rb +30 -0
  100. data/lib/treat/loaders/stanford.rb +27 -0
  101. data/lib/treat/object.rb +1 -0
  102. data/lib/treat/processors.rb +37 -44
  103. data/lib/treat/processors/chunkers/autoselect.rb +16 -0
  104. data/lib/treat/processors/chunkers/html.rb +71 -0
  105. data/lib/treat/processors/chunkers/txt.rb +18 -24
  106. data/lib/treat/processors/parsers/enju.rb +253 -208
  107. data/lib/treat/processors/parsers/stanford.rb +130 -131
  108. data/lib/treat/processors/segmenters/punkt.rb +79 -45
  109. data/lib/treat/processors/segmenters/stanford.rb +46 -48
  110. data/lib/treat/processors/segmenters/tactful.rb +43 -36
  111. data/lib/treat/processors/tokenizers/perl.rb +124 -92
  112. data/lib/treat/processors/tokenizers/ptb.rb +81 -0
  113. data/lib/treat/processors/tokenizers/punkt.rb +48 -42
  114. data/lib/treat/processors/tokenizers/stanford.rb +39 -38
  115. data/lib/treat/processors/tokenizers/tactful.rb +64 -55
  116. data/lib/treat/proxies.rb +52 -35
  117. data/lib/treat/retrievers.rb +26 -16
  118. data/lib/treat/retrievers/indexers/ferret.rb +47 -26
  119. data/lib/treat/retrievers/searchers/ferret.rb +69 -50
  120. data/lib/treat/tree.rb +241 -183
  121. data/spec/collection.rb +123 -0
  122. data/spec/document.rb +93 -0
  123. data/spec/entity.rb +408 -0
  124. data/spec/languages.rb +25 -0
  125. data/spec/phrase.rb +146 -0
  126. data/spec/samples/mathematicians/archimedes.abw +34 -0
  127. data/spec/samples/mathematicians/euler.html +21 -0
  128. data/spec/samples/mathematicians/gauss.pdf +0 -0
  129. data/spec/samples/mathematicians/leibniz.txt +13 -0
  130. data/spec/samples/mathematicians/newton.doc +0 -0
  131. data/spec/sandbox.rb +5 -0
  132. data/spec/token.rb +109 -0
  133. data/spec/treat.rb +52 -0
  134. data/spec/tree.rb +117 -0
  135. data/spec/word.rb +110 -0
  136. data/spec/zone.rb +66 -0
  137. data/tmp/INFO +1 -1
  138. metadata +100 -201
  139. data/INSTALL +0 -1
  140. data/README +0 -3
  141. data/TODO +0 -28
  142. data/lib/economist/half_cocked_basel.txt +0 -16
  143. data/lib/economist/hungarys_troubles.txt +0 -46
  144. data/lib/economist/indias_slowdown.txt +0 -15
  145. data/lib/economist/merkozy_rides_again.txt +0 -24
  146. data/lib/economist/prada_is_not_walmart.txt +0 -9
  147. data/lib/economist/to_infinity_and_beyond.txt +0 -15
  148. data/lib/ferret/_11.cfs +0 -0
  149. data/lib/ferret/_14.cfs +0 -0
  150. data/lib/ferret/_p.cfs +0 -0
  151. data/lib/ferret/_s.cfs +0 -0
  152. data/lib/ferret/_v.cfs +0 -0
  153. data/lib/ferret/_y.cfs +0 -0
  154. data/lib/ferret/segments +0 -0
  155. data/lib/ferret/segments_15 +0 -0
  156. data/lib/treat/buildable.rb +0 -157
  157. data/lib/treat/category.rb +0 -33
  158. data/lib/treat/delegatable.rb +0 -116
  159. data/lib/treat/doable.rb +0 -45
  160. data/lib/treat/entities/collection.rb +0 -14
  161. data/lib/treat/entities/document.rb +0 -12
  162. data/lib/treat/entities/phrases.rb +0 -17
  163. data/lib/treat/entities/tokens.rb +0 -61
  164. data/lib/treat/entities/zones.rb +0 -41
  165. data/lib/treat/extractors/coreferences/stanford.rb +0 -69
  166. data/lib/treat/extractors/date/chronic.rb +0 -32
  167. data/lib/treat/extractors/date/ruby.rb +0 -25
  168. data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
  169. data/lib/treat/extractors/language/language_extractor.rb +0 -27
  170. data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
  171. data/lib/treat/extractors/roles/naive.rb +0 -73
  172. data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
  173. data/lib/treat/extractors/statistics/position_in.rb +0 -14
  174. data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
  175. data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
  176. data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
  177. data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
  178. data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
  179. data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
  180. data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
  181. data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
  182. data/lib/treat/feature.rb +0 -58
  183. data/lib/treat/features.rb +0 -7
  184. data/lib/treat/formatters/visualizers/short_value.rb +0 -29
  185. data/lib/treat/formatters/visualizers/txt.rb +0 -45
  186. data/lib/treat/group.rb +0 -106
  187. data/lib/treat/helpers/linguistics_loader.rb +0 -18
  188. data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
  189. data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
  190. data/lib/treat/inflectors/declensions/english.rb +0 -319
  191. data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
  192. data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
  193. data/lib/treat/inflectors/stem/porter.rb +0 -162
  194. data/lib/treat/inflectors/stem/porter_c.rb +0 -26
  195. data/lib/treat/inflectors/stem/uea.rb +0 -30
  196. data/lib/treat/install.rb +0 -59
  197. data/lib/treat/languages/tags.rb +0 -377
  198. data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
  199. data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
  200. data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
  201. data/lib/treat/lexicalizers/tag/brill.rb +0 -91
  202. data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
  203. data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
  204. data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
  205. data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
  206. data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
  207. data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
  208. data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
  209. data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
  210. data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
  211. data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
  212. data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
  213. data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
  214. data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
  215. data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
  216. data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
  217. data/lib/treat/registrable.rb +0 -28
  218. data/lib/treat/sugar.rb +0 -50
  219. data/lib/treat/viewable.rb +0 -29
  220. data/lib/treat/visitable.rb +0 -28
  221. data/test/profile.rb +0 -2
  222. data/test/tc_entity.rb +0 -117
  223. data/test/tc_extractors.rb +0 -73
  224. data/test/tc_formatters.rb +0 -41
  225. data/test/tc_inflectors.rb +0 -34
  226. data/test/tc_lexicalizers.rb +0 -32
  227. data/test/tc_processors.rb +0 -50
  228. data/test/tc_resources.rb +0 -22
  229. data/test/tc_treat.rb +0 -60
  230. data/test/tc_tree.rb +0 -60
  231. data/test/tests.rb +0 -20
  232. data/test/texts.rb +0 -19
  233. data/test/texts/english/half_cocked_basel.txt +0 -16
  234. data/test/texts/english/hose_and_dry.doc +0 -0
  235. data/test/texts/english/hungarys_troubles.abw +0 -70
  236. data/test/texts/english/long.html +0 -24
  237. data/test/texts/english/long.txt +0 -22
  238. data/test/texts/english/medium.txt +0 -5
  239. data/test/texts/english/republican_nomination.pdf +0 -0
  240. data/test/texts/english/saving_the_euro.odt +0 -0
  241. data/test/texts/english/short.txt +0 -3
  242. data/test/texts/english/zero_sum.html +0 -111
data/lib/treat/formatters.rb

@@ -1,37 +1,41 @@
- module Treat
-   # Formatters handle conversion of Entities to and from
-   # external file formats.
-   module Formatters
-     # Readers read a document and create the top-level entity
-     # corresponding to the content of the document.
-     module Readers
-       extend Group
-       self.type = :transformer
-       self.targets = [:collection, :document]
-       self.default = :autoselect
-     end
-     # Serializers transform entities into a storable format.
-     module Serializers
-       extend Group
-       self.type = :computer
-       self.targets = [:entity]
-       self.default = :yaml
-     end
-     # Unserializers recreate entities from a serialized format.
-     module Unserializers
-       extend Group
-       self.type = :transformer
-       self.targets = [:collection, :document]
-       self.default = :autoselect
-     end
-     # Visualizers transform entities into a visualizable format.
-     module Visualizers
-       extend Group
-       self.type = :computer
-       self.targets = [:entity]
-       self.default = :tree
-     end
-     extend Treat::Category
+ # Formatters handle conversion of Entities to and from
+ # external file formats.
+ module Treat::Formatters
+
+   # Readers read a document's content.
+   module Readers
+     extend Treat::Groupable
+     self.type = :computer
+     self.targets = [:document]
    end
+
+   # Unserializers recreate entities
+   # from a serialized format.
+   module Unserializers
+     extend Treat::Groupable
+     self.type = :computer
+     self.targets = [:entity]
+   end
+
+   # Serializers transform entities
+   # into a storable format.
+   module Serializers
+     extend Treat::Groupable
+     self.type = :computer
+     self.targets = [:entity]
+     self.default = :yaml
+   end
+
+   # Visualizers transform entities
+   # into a visualizable format.
+   module Visualizers
+     extend Treat::Groupable
+     self.type = :computer
+     self.targets = [:entity]
+     self.default = :tree
+   end
+
+   # Make Formatters categorizable.
+   extend Treat::Categorizable
+
  end
-
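The four groups above are configured entirely through the Treat::Groupable DSL (self.type, self.targets, self.default). A minimal, self-contained sketch of how such a module-level DSL can work; this is illustrative only, not Treat's actual Groupable implementation, and the names GroupableSketch and DemoReaders are hypothetical:

  # Hypothetical stand-in for Treat::Groupable: extending a module
  # with it gives that module the three class-level settings used
  # by the formatter groups above.
  module GroupableSketch
    attr_accessor :type, :targets, :default
  end

  module DemoReaders
    extend GroupableSketch
    self.type    = :computer
    self.targets = [:document]
    self.default = :autoselect
  end

  DemoReaders.type     # => :computer
  DemoReaders.targets  # => [:document]
  DemoReaders.default  # => :autoselect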
data/lib/treat/formatters/readers/abw.rb

@@ -1,33 +1,53 @@
- module Treat
-   module Formatters
-     module Readers
-       class Abw
-         require 'rexml/document'
-         require 'rexml/streamlistener'
-         def self.read(document, options = {})
-           xml_h = AbiWordXmlHandler.new
-           REXML::Document.parse_stream(IO.read(document.file), xml_h)
-           document << Treat::Entities::Entity.from_string(xml_h.plain_text)
-           document
-         end
-         class AbiWordXmlHandler
-           include REXML::StreamListener
-           attr_reader :plain_text
-           def initialize
-             @plain_text = ""
-           end
-           def text(s)
-             if s != 'AbiWord' && s != 'application/x-abiword'
-               s.strip!
-               if s.length > 0
-                 s += ' '
-                 s += "\n\n" if s.length < 60
-               end
-               @plain_text << s
-             end
-           end
+ # A wrapper for a small utility written
+ # by Mark Watson to read AbiWord files.
+ # Released under the GPL.
+ #
+ # Original project website:
+ # http://www.markwatson.com/opensource/
+ #
+ # Todo: reimplement with Nokogiri and use
+ # XML node information to better translate
+ # the format of the text.
+ class Treat::Formatters::Readers::ABW
+
+   silence_warnings do
+     require 'rexml/document'
+     require 'rexml/streamlistener'
+   end
+
+   # Extract the readable text from an AbiWord file.
+   #
+   # Options: none.
+   def self.read(document, options = {})
+
+     xml_h = ABWXmlHandler.new
+     REXML::Document.parse_stream(
+       IO.read(document.file), xml_h)
+
+     document.value = xml_h.plain_text
+     document.set :format, :abw_word
+     document
+
+   end
+
+   # Helper class to parse the AbiWord file.
+   class ABWXmlHandler
+     include REXML::StreamListener
+     attr_reader :plain_text
+     def initialize
+       @plain_text = ""
+     end
+     def text(s)
+       if s != 'AbiWord' && s !=
+          'application/x-abiword'
+         s.strip!
+         if s.length > 0
+           s += ' '
+           s += "\n\n" if s.length < 45
          end
+         @plain_text << s
        end
      end
    end
- end
+
+ end
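For reference, ABWXmlHandler follows REXML's stream-parsing (SAX-style) pattern: REXML::Document.parse_stream walks the XML and fires callbacks on a listener object, so only the collected text ever accumulates in memory. A standalone sketch using plain REXML, with no Treat dependencies (class name and sample XML are made up for illustration):

  require 'rexml/document'
  require 'rexml/streamlistener'

  # Accumulates the character data of every text node it is handed.
  class TextCollector
    include REXML::StreamListener
    attr_reader :plain_text
    def initialize
      @plain_text = ''
    end
    def text(s)  # called once per text node
      s = s.strip
      @plain_text << s << "\n" unless s.empty?
    end
  end

  listener = TextCollector.new
  REXML::Document.parse_stream('<abw><p>Hello</p><p>world</p></abw>', listener)
  puts listener.plain_text  # prints "Hello" and "world", one per line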
data/lib/treat/formatters/readers/autoselect.rb

@@ -1,35 +1,39 @@
- module Treat
-   module Formatters
-     module Readers
-       # This class isn't a wrapper for anything.
-       # It simply delegates the reading task to
-       # the appropriate reader based on the file
-       # extension of the supplied document.
-       class Autoselect
-         # A list of image extensions that should be routed to Ocropus.
-         ImageExtensions = ['gif', 'jpg', 'jpeg', 'png']
-         # Select the appropriate reader based on the format
-         # of the filename in document.
-         #
-         # Options:
-         #
-         # - :ocr_engine => :ocropus or :gocr (the OCR engine to use).
-         def self.read(document, options)
-           ext = document.file.split('.')[-1]
-           reader = ImageExtensions.include?(ext) ? 'image' : ext
-           reader = 'html' if reader == 'htm'
-           reader = 'yaml' if reader == 'yml'
-           begin
-             r = Treat::Formatters::Readers.const_get(cc(reader))
-           rescue NameError
-             raise Treat::Exception,
-               "Cannot find a reader for format: '#{ext}'."
-           end
-           document = r.read(document, options)
-           document.set :encoding, document.to_s.encoding.to_s.downcase
-           document
-         end
-       end
-     end
+ class Treat::Formatters::Readers::Autoselect
+
+   ExtensionRegexp = /^.*?\.([a-zA-Z0-9]{2,5})$/
+   ImageExtensions = ['gif', 'jpg', 'jpeg', 'png']
+   DefaultOptions = {
+     :default_to => :txt
+   }
+
+   # Choose a reader to use.
+   #
+   # Options:
+   # - (Symbol) :default_to => format to default to.
+   def self.read(document, options = {})
+     options = DefaultOptions.merge(options)
+     document.read(detect_format(document.file, options[:default_to]))
    end
+
+   def self.detect_format(filename, default_to = DefaultOptions[:default_to])
+
+     ext = filename.scan(ExtensionRegexp)
+     ext = (ext.is_a?(Array) && ext[0] && ext[0][0]) ?
+           ext[0][0] : ''
+
+     format =
+       ImageExtensions.include?(ext) ?
+       'image' : ext
+
+     # Humanize extensions.
+     format = 'html' if format == 'htm'
+     format = 'yaml' if format == 'yml'
+
+     format = default_to if format == ''
+
+     format.intern
+
+   end
+
+
  end
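Tracing the detect_format logic above, typical filenames map to reader symbols as follows (illustrative; the filenames are placeholders and the calls assume readers exist for the resulting formats):

  Autoselect = Treat::Formatters::Readers::Autoselect

  Autoselect.detect_format('report.pdf')  # => :pdf
  Autoselect.detect_format('page.htm')    # => :html   ('htm' is humanized to 'html')
  Autoselect.detect_format('config.yml')  # => :yaml
  Autoselect.detect_format('scan.jpeg')   # => :image  (routed to the OCR-based reader)
  Autoselect.detect_format('notes')       # => :txt    (no extension; falls back to :default_to)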
data/lib/treat/formatters/readers/doc.rb

@@ -1,15 +1,21 @@
- module Treat
-   module Formatters
-     module Readers
-       class Doc
-         def self.read(document, options = {})
-           f = `antiword #{document.file}`
-           f.gsub!("\n\n", '#keep#')
-           f.gsub!("\n", ' ')
-           f.gsub!('#keep#', "\n\n")
-           document << Treat::Entities::Entity.from_string(f)
-         end
-       end
-     end
+ # A wrapper for the 'antiword' command-line utility.
+ class Treat::Formatters::Readers::DOC
+
+   # Extract the readable text from a DOC file
+   # using the antiword command-line utility.
+   #
+   # Options: none.
+   def self.read(document, options = {})
+
+     f = `antiword #{document.file}`
+     f.gsub!("\n\n", '#keep#')
+     f.gsub!("\n", ' ')
+     f.gsub!('#keep#', "\n\n")
+
+     document.value = f
+     document.set :format, :doc
+     document
+
    end
+
  end
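The three gsub! calls above are a small placeholder trick for antiword's output: real paragraph breaks ("\n\n") are shielded behind a marker, hard line wraps are joined into spaces, and the paragraph breaks are then restored. A standalone illustration of the same technique:

  text = "First line\nof paragraph one.\n\nParagraph two."
  text.gsub!("\n\n", '#keep#')  # shield real paragraph breaks
  text.gsub!("\n", ' ')         # join hard-wrapped lines into one
  text.gsub!('#keep#', "\n\n")  # restore the paragraph breaks
  puts text
  # First line of paragraph one.
  #
  # Paragraph two.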
data/lib/treat/formatters/readers/html.rb

@@ -1,33 +1,55 @@
- module Treat
-   module Formatters
-     module Readers
-       # A temporary HTML reader; simply strips the
-       # document of all of its markup.
-       class HTML
-         # Require Hpricot.
-         silence_warnings { require 'hpricot' }
-         # By default, backup the HTML text while cleaning.
-         DefaultOptions = { :clean => true, :backup => false }
-         # Read the HTML document and strip it of its markup.
-         #
-         # Options:
-         #
-         # - (Boolean) :clean => whether to strip HTML markup.
-         # - (Boolean) :backup => whether to backup the HTML
-         #   markup while cleaning.
-         def self.read(document, options = {})
-           options = DefaultOptions.merge(options)
-           f = File.read(document.file)
-           document << Treat::Entities::Entity.from_string(f)
-           if options[:clean]
-             document.each do |section|
-               section.set :html_value, section.value if options[:backup]
-               section.value = Hpricot(section.value).inner_text
-             end
-           end
-           document
-         end
-       end
+ # This class is a wrapper for the 'ruby-readability'
+ # gem, which extracts the primary readable content
+ # of a web page by using set of handwritten rules.
+ #
+ # Project homepage:
+ # https://github.com/iterationlabs/ruby-readability
+ class Treat::Formatters::Readers::HTML
+
+   silence_warnings { require 'ruby-readability' }
+
+   # By default, don't backup the original HTML
+   DefaultOptions = {
+     :keep_html => false,
+     :tags => %w[p div h1 h2 h3 ul ol dl dt li]
+   }
+
+   # Read the HTML document and strip it of its markup.
+   #
+   # Options:
+   #
+   #   text when cleaning the document (default: false).
+   # - (Boolean) :remove_empty_nodes => remove <p> tags
+   #   that have no text content
+   # - (String) :encoding => if the page is of a known
+   #   encoding, you can specify it; if left unspecified,
+   #   the encoding will be guessed (only in Ruby 1.9.x)
+   # - (String) :html_headers => in Ruby 1.9.x these will
+   #   be passed to the guess_html_encoding gem to aid with
+   #   guessing the HTML encoding.
+   # - (Array of String) :tags => the base whitelist of
+   #   tags to sanitize, defaults to %w[div p].
+   #   also removes p tags that contain only images
+   # - (Array of String) :attributes => list allowed attributes
+   # - (Array of String) :ignore_image_format => for use with images.
+   # - (Numeric) :min_image_height => minimum image height for images.
+   # - (Numeric) :min_image_width => minimum image width for images.
+   def self.read(document, options = {})
+
+     # set encoding with the guess_html_encoding
+     options = DefaultOptions.merge(options)
+     html = File.read(document.file)
+
+     silence_warnings do
+       # Strip comments
+       html.gsub!(/<!--[^>]*-->/m, '')
+       d = Readability::Document.new(html, options)
+       document.value = "<h1>#{d.title}</h1>\n" + d.content
+       document.set :format, :html
      end
+
+     document
+
    end
+
  end
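Outside of Treat, the underlying gem is driven the same way the new reader does above: build a Readability::Document from the raw HTML, then ask it for the title and main content. A sketch (the filename and the option values are placeholders):

  require 'ruby-readability'  # the same require the reader uses

  html = File.read('article.html')  # any saved web page
  doc  = Readability::Document.new(html, :tags => %w[p div h1 h2 h3])
  puts doc.title    # headline of the page
  puts doc.content  # main readable body, returned as sanitized HTML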
data/lib/treat/formatters/readers/image.rb

@@ -1,43 +1,44 @@
- module Treat
-   module Formatters
-     module Readers
-       # This class is a wrapper for the Google Ocropus
-       # optical character recognition (OCR) engine.
-       #
-       # "OCRopus(tm) is a state-of-the-art document
-       # analysis and OCR system, featuring pluggable
-       # layout analysis, pluggable character recognition,
-       # statistical natural language modeling, and multi-
-       # lingual capabilities."
-       #
-       # Original paper:
-       # Breuel, Thomas M. The Ocropus Open Source OCR System.
-       # DFKI and U. Kaiserslautern, Germany.
-       class Image
-         # Read a file using the Google Ocropus reader.
-         #
-         # Options:
-         # - (Boolean) :silent => whether to silence Ocropus.
-         def self.read(document, options = {})
-           read = lambda do |doc|
-             create_temp_dir do |tmp|
-               `ocropus book2pages #{tmp}/out #{doc.file}`
-               `ocropus pages2lines #{tmp}/out`
-               `ocropus lines2fsts #{tmp}/out`
-               `ocropus buildhtml #{tmp}/out > #{tmp}/output.html`
-               f = document.file
-               doc.remove_all!
-               doc.set :file, "#{tmp}/output.html"
-               doc.read(:html)
-               doc.set :file, f
-             end
-           end
-           options[:silent] ?
-             silence_stdout { read.call(document) } :
-             read.call(document)
-           document
-         end
+ # This class is a wrapper for the Google Ocropus
+ # optical character recognition (OCR) engine.
+ #
+ # "OCRopus(tm) is a state-of-the-art document
+ # analysis and OCR system, featuring pluggable
+ # layout analysis, pluggable character recognition,
+ # statistical natural language modeling, and multi-
+ # lingual capabilities."
+ #
+ # Original paper:
+ #
+ # Breuel, Thomas M. The Ocropus Open Source OCR System.
+ # DFKI and U. Kaiserslautern, Germany.
+ class Treat::Formatters::Readers::Image
+
+   # Read a file using the Google Ocropus reader.
+   #
+   # Options:
+   #
+   # - (Boolean) :silent => whether to silence Ocropus.
+   def self.read(document, options = {})
+
+     read = lambda do |doc|
+       create_temp_dir do |tmp|
+         `ocropus book2pages #{tmp}/out #{doc.file}`
+         `ocropus pages2lines #{tmp}/out`
+         `ocropus lines2fsts #{tmp}/out`
+         `ocropus buildhtml #{tmp}/out > #{tmp}/output.html`
+         doc.set :file, "#{tmp}/output.html"
+         doc = doc.read(:html)
+         doc.set :file, f
+         doc.set :format, :image
        end
      end
+
+     options[:silent] ?
+       silence_stdout { read.call(document) } :
+       read.call(document)
+
+     document
+
    end
- end
+
+ end
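The :silent option above simply decides whether the OCR lambda runs inside a silence_stdout block. Treat defines that helper in its own kernel extensions; a minimal Ruby-level stand-in is sketched below for illustration (it swallows what the block itself prints, not what external commands write, and Treat's real helper may differ):

  require 'stringio'

  # Hypothetical stand-in for silence_stdout: redirect $stdout to an
  # in-memory buffer for the duration of the block, then restore it.
  def silence_stdout
    original, $stdout = $stdout, StringIO.new
    yield
  ensure
    $stdout = original
  end

  work = lambda { puts 'noisy OCR progress...'; :done }
  silence_stdout { work.call }  # prints nothing
  work.call                     # prints "noisy OCR progress..."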
data/lib/treat/formatters/readers/odt.rb

@@ -1,50 +1,64 @@
- module Treat
-   module Formatters
-     module Readers
-       # A reader for the ODT (Open Office) document format.
-       #
-       # Based on work by Mark Watson, licensed under the GPL.
-       # Original project website: http://www.markwatson.com/opensource/
-       class Odt
-         # Require the 'zip' gem to unarchive the ODT files
-         silence_warnings { require 'zip' }
-         # Build an entity from an ODT file.
-         def self.read(document, options = {})
-           f = nil
-           Zip::ZipFile.open(document.file, Zip::ZipFile::CREATE) do |zipfile|
-             f = zipfile.read('content.xml')
-           end
-           raise "Couldn't unzip dot file #{document.file}!" unless f
-           xml_h = OOXmlHandler.new
-           REXML::Document.parse_stream(f, xml_h)
-           document << Treat::Entities::Entity.from_string(xml_h.plain_text)
-           document
-         end
-         # Xml listener for the parsing of the ODT file.
-         class OOXmlHandler
-           require 'rexml/document'
-           require 'rexml/streamlistener'
-           include REXML::StreamListener
-           attr_reader :plain_text
-           def initialize
-             @plain_text = ""
-             @last_name = ""
-           end
-           def tag_start(name, attrs)
-             @last_name = name
-           end
-           def text(s)
-             if @last_name.index('text')
-               s = s.strip
-               if s.length > 0
-                 @plain_text << s
-                 @plain_text << "\n\n"
-               end
-             end
-           end
+ # A reader for the ODT (Open Office)
+ # document format.
+ #
+ # Based on work by Mark Watson,
+ # licensed under the GPL.
+ #
+ # Original project website:
+ # http://www.markwatson.com/opensource/
+ #
+ # Todo: reimplement with Nokogiri and use
+ # XML node information to better translate
+ # the format of the text.
+ class Treat::Formatters::Readers::ODT
+
+   # Require the 'zip' gem to unarchive the ODT files
+   silence_warnings { require 'zip' }
+
+   # Extract the readable text from an ODT file.
+   #
+   # Options: none.
+   def self.read(document, options = {})
+     f = nil
+     Zip::ZipFile.open(document.file,
+       Zip::ZipFile::CREATE) do |zipfile|
+       f = zipfile.read('content.xml')
+     end
+     raise "Couldn't unzip dot file " +
+       "#{document.file}!" unless f
+     xml_h = ODTXmlHandler.new
+     REXML::Document.parse_stream(f, xml_h)
+
+     document.value = xml_h.plain_text
+     document.set :format, :odt_office
+     document
+
+   end
+
+   # Xml listener for the parsing of the ODT file.
+   class ODTXmlHandler
+     silence_warnings do
+       require 'rexml/document'
+       require 'rexml/streamlistener'
+     end
+     include REXML::StreamListener
+     attr_reader :plain_text
+     def initialize
+       @plain_text = ""
+       @last_name = ""
+     end
+     def tag_start(name, attrs)
+       @last_name = name
+     end
+     def text(s)
+       if @last_name.index('text')
+         s = s.strip
+         if s.length > 0
+           @plain_text << s
+           @plain_text << "\n\n"
          end
        end
-
      end
    end
+
  end
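An ODT file is an ordinary zip archive whose document text lives in a content.xml entry, which is why the reader above only needs the zip gem plus the same REXML stream listener used for AbiWord files. A minimal sketch of the unzipping step, using the same pre-1.0 rubyzip API as the reader ('letter.odt' is a placeholder; note that rubyzip 1.0+ renames Zip::ZipFile to Zip::File):

  require 'zip'

  xml = nil
  Zip::ZipFile.open('letter.odt') do |zipfile|
    xml = zipfile.read('content.xml')  # the document body, as raw XML
  end
  puts xml[0, 200]  # peek at the first few hundred bytes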