treat 0.2.5 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +3 -3
- data/README.md +33 -0
- data/files/INFO +1 -0
- data/lib/treat.rb +40 -105
- data/lib/treat/ai.rb +12 -0
- data/lib/treat/ai/classifiers/id3.rb +27 -0
- data/lib/treat/categories.rb +82 -35
- data/lib/treat/categorizable.rb +44 -0
- data/lib/treat/classification.rb +61 -0
- data/lib/treat/configurable.rb +115 -0
- data/lib/treat/data_set.rb +42 -0
- data/lib/treat/dependencies.rb +24 -0
- data/lib/treat/downloader.rb +87 -0
- data/lib/treat/entities.rb +68 -66
- data/lib/treat/entities/abilities.rb +10 -0
- data/lib/treat/entities/abilities/buildable.rb +327 -0
- data/lib/treat/entities/abilities/checkable.rb +31 -0
- data/lib/treat/entities/abilities/copyable.rb +45 -0
- data/lib/treat/entities/abilities/countable.rb +51 -0
- data/lib/treat/entities/abilities/debuggable.rb +83 -0
- data/lib/treat/entities/abilities/delegatable.rb +123 -0
- data/lib/treat/entities/abilities/doable.rb +62 -0
- data/lib/treat/entities/abilities/exportable.rb +11 -0
- data/lib/treat/entities/abilities/iterable.rb +115 -0
- data/lib/treat/entities/abilities/magical.rb +83 -0
- data/lib/treat/entities/abilities/registrable.rb +74 -0
- data/lib/treat/entities/abilities/stringable.rb +91 -0
- data/lib/treat/entities/entities.rb +104 -0
- data/lib/treat/entities/entity.rb +122 -245
- data/lib/treat/exception.rb +4 -4
- data/lib/treat/extractors.rb +77 -80
- data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
- data/lib/treat/extractors/language/what_language.rb +50 -45
- data/lib/treat/extractors/name_tag/stanford.rb +55 -0
- data/lib/treat/extractors/tf_idf/native.rb +87 -0
- data/lib/treat/extractors/time/chronic.rb +55 -0
- data/lib/treat/extractors/time/nickel.rb +86 -62
- data/lib/treat/extractors/time/ruby.rb +53 -0
- data/lib/treat/extractors/topic_words/lda.rb +67 -58
- data/lib/treat/extractors/topics/reuters.rb +100 -87
- data/lib/treat/formatters.rb +39 -35
- data/lib/treat/formatters/readers/abw.rb +49 -29
- data/lib/treat/formatters/readers/autoselect.rb +37 -33
- data/lib/treat/formatters/readers/doc.rb +19 -13
- data/lib/treat/formatters/readers/html.rb +52 -30
- data/lib/treat/formatters/readers/image.rb +41 -40
- data/lib/treat/formatters/readers/odt.rb +59 -45
- data/lib/treat/formatters/readers/pdf.rb +28 -25
- data/lib/treat/formatters/readers/txt.rb +12 -15
- data/lib/treat/formatters/readers/xml.rb +73 -36
- data/lib/treat/formatters/serializers/xml.rb +80 -79
- data/lib/treat/formatters/serializers/yaml.rb +19 -18
- data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
- data/lib/treat/formatters/unserializers/xml.rb +94 -99
- data/lib/treat/formatters/unserializers/yaml.rb +20 -19
- data/lib/treat/formatters/visualizers/dot.rb +132 -132
- data/lib/treat/formatters/visualizers/standoff.rb +52 -44
- data/lib/treat/formatters/visualizers/tree.rb +26 -29
- data/lib/treat/groupable.rb +153 -0
- data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
- data/lib/treat/inflectors.rb +50 -45
- data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
- data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
- data/lib/treat/inflectors/declensors/active_support.rb +31 -0
- data/lib/treat/inflectors/declensors/english.rb +38 -0
- data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
- data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
- data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
- data/lib/treat/inflectors/stemmers/porter.rb +160 -0
- data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
- data/lib/treat/inflectors/stemmers/uea.rb +28 -0
- data/lib/treat/installer.rb +308 -0
- data/lib/treat/kernel.rb +105 -27
- data/lib/treat/languages.rb +122 -88
- data/lib/treat/languages/arabic.rb +15 -15
- data/lib/treat/languages/chinese.rb +15 -15
- data/lib/treat/languages/dutch.rb +15 -15
- data/lib/treat/languages/english.rb +61 -62
- data/lib/treat/languages/french.rb +19 -19
- data/lib/treat/languages/german.rb +20 -20
- data/lib/treat/languages/greek.rb +15 -15
- data/lib/treat/languages/italian.rb +16 -16
- data/lib/treat/languages/polish.rb +15 -15
- data/lib/treat/languages/portuguese.rb +15 -15
- data/lib/treat/languages/russian.rb +15 -15
- data/lib/treat/languages/spanish.rb +16 -16
- data/lib/treat/languages/swedish.rb +16 -16
- data/lib/treat/lexicalizers.rb +34 -55
- data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
- data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
- data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
- data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
- data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
- data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
- data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
- data/lib/treat/linguistics.rb +9 -0
- data/lib/treat/linguistics/categories.rb +11 -0
- data/lib/treat/linguistics/tags.rb +422 -0
- data/lib/treat/loaders/linguistics.rb +30 -0
- data/lib/treat/loaders/stanford.rb +27 -0
- data/lib/treat/object.rb +1 -0
- data/lib/treat/processors.rb +37 -44
- data/lib/treat/processors/chunkers/autoselect.rb +16 -0
- data/lib/treat/processors/chunkers/html.rb +71 -0
- data/lib/treat/processors/chunkers/txt.rb +18 -24
- data/lib/treat/processors/parsers/enju.rb +253 -208
- data/lib/treat/processors/parsers/stanford.rb +130 -131
- data/lib/treat/processors/segmenters/punkt.rb +79 -45
- data/lib/treat/processors/segmenters/stanford.rb +46 -48
- data/lib/treat/processors/segmenters/tactful.rb +43 -36
- data/lib/treat/processors/tokenizers/perl.rb +124 -92
- data/lib/treat/processors/tokenizers/ptb.rb +81 -0
- data/lib/treat/processors/tokenizers/punkt.rb +48 -42
- data/lib/treat/processors/tokenizers/stanford.rb +39 -38
- data/lib/treat/processors/tokenizers/tactful.rb +64 -55
- data/lib/treat/proxies.rb +52 -35
- data/lib/treat/retrievers.rb +26 -16
- data/lib/treat/retrievers/indexers/ferret.rb +47 -26
- data/lib/treat/retrievers/searchers/ferret.rb +69 -50
- data/lib/treat/tree.rb +241 -183
- data/spec/collection.rb +123 -0
- data/spec/document.rb +93 -0
- data/spec/entity.rb +408 -0
- data/spec/languages.rb +25 -0
- data/spec/phrase.rb +146 -0
- data/spec/samples/mathematicians/archimedes.abw +34 -0
- data/spec/samples/mathematicians/euler.html +21 -0
- data/spec/samples/mathematicians/gauss.pdf +0 -0
- data/spec/samples/mathematicians/leibniz.txt +13 -0
- data/spec/samples/mathematicians/newton.doc +0 -0
- data/spec/sandbox.rb +5 -0
- data/spec/token.rb +109 -0
- data/spec/treat.rb +52 -0
- data/spec/tree.rb +117 -0
- data/spec/word.rb +110 -0
- data/spec/zone.rb +66 -0
- data/tmp/INFO +1 -1
- metadata +100 -201
- data/INSTALL +0 -1
- data/README +0 -3
- data/TODO +0 -28
- data/lib/economist/half_cocked_basel.txt +0 -16
- data/lib/economist/hungarys_troubles.txt +0 -46
- data/lib/economist/indias_slowdown.txt +0 -15
- data/lib/economist/merkozy_rides_again.txt +0 -24
- data/lib/economist/prada_is_not_walmart.txt +0 -9
- data/lib/economist/to_infinity_and_beyond.txt +0 -15
- data/lib/ferret/_11.cfs +0 -0
- data/lib/ferret/_14.cfs +0 -0
- data/lib/ferret/_p.cfs +0 -0
- data/lib/ferret/_s.cfs +0 -0
- data/lib/ferret/_v.cfs +0 -0
- data/lib/ferret/_y.cfs +0 -0
- data/lib/ferret/segments +0 -0
- data/lib/ferret/segments_15 +0 -0
- data/lib/treat/buildable.rb +0 -157
- data/lib/treat/category.rb +0 -33
- data/lib/treat/delegatable.rb +0 -116
- data/lib/treat/doable.rb +0 -45
- data/lib/treat/entities/collection.rb +0 -14
- data/lib/treat/entities/document.rb +0 -12
- data/lib/treat/entities/phrases.rb +0 -17
- data/lib/treat/entities/tokens.rb +0 -61
- data/lib/treat/entities/zones.rb +0 -41
- data/lib/treat/extractors/coreferences/stanford.rb +0 -69
- data/lib/treat/extractors/date/chronic.rb +0 -32
- data/lib/treat/extractors/date/ruby.rb +0 -25
- data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
- data/lib/treat/extractors/language/language_extractor.rb +0 -27
- data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
- data/lib/treat/extractors/roles/naive.rb +0 -73
- data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
- data/lib/treat/extractors/statistics/position_in.rb +0 -14
- data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
- data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
- data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
- data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
- data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
- data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
- data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
- data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
- data/lib/treat/feature.rb +0 -58
- data/lib/treat/features.rb +0 -7
- data/lib/treat/formatters/visualizers/short_value.rb +0 -29
- data/lib/treat/formatters/visualizers/txt.rb +0 -45
- data/lib/treat/group.rb +0 -106
- data/lib/treat/helpers/linguistics_loader.rb +0 -18
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
- data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
- data/lib/treat/inflectors/declensions/english.rb +0 -319
- data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
- data/lib/treat/inflectors/stem/porter.rb +0 -162
- data/lib/treat/inflectors/stem/porter_c.rb +0 -26
- data/lib/treat/inflectors/stem/uea.rb +0 -30
- data/lib/treat/install.rb +0 -59
- data/lib/treat/languages/tags.rb +0 -377
- data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
- data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
- data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
- data/lib/treat/lexicalizers/tag/brill.rb +0 -91
- data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
- data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
- data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
- data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
- data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
- data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
- data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
- data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
- data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
- data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
- data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
- data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
- data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
- data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
- data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
- data/lib/treat/registrable.rb +0 -28
- data/lib/treat/sugar.rb +0 -50
- data/lib/treat/viewable.rb +0 -29
- data/lib/treat/visitable.rb +0 -28
- data/test/profile.rb +0 -2
- data/test/tc_entity.rb +0 -117
- data/test/tc_extractors.rb +0 -73
- data/test/tc_formatters.rb +0 -41
- data/test/tc_inflectors.rb +0 -34
- data/test/tc_lexicalizers.rb +0 -32
- data/test/tc_processors.rb +0 -50
- data/test/tc_resources.rb +0 -22
- data/test/tc_treat.rb +0 -60
- data/test/tc_tree.rb +0 -60
- data/test/tests.rb +0 -20
- data/test/texts.rb +0 -19
- data/test/texts/english/half_cocked_basel.txt +0 -16
- data/test/texts/english/hose_and_dry.doc +0 -0
- data/test/texts/english/hungarys_troubles.abw +0 -70
- data/test/texts/english/long.html +0 -24
- data/test/texts/english/long.txt +0 -22
- data/test/texts/english/medium.txt +0 -5
- data/test/texts/english/republican_nomination.pdf +0 -0
- data/test/texts/english/saving_the_euro.odt +0 -0
- data/test/texts/english/short.txt +0 -3
- data/test/texts/english/zero_sum.html +0 -111
data/lib/treat/formatters.rb
CHANGED
@@ -1,37 +1,41 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
self.targets = [:collection, :document]
|
11
|
-
self.default = :autoselect
|
12
|
-
end
|
13
|
-
# Serializers transform entities into a storable format.
|
14
|
-
module Serializers
|
15
|
-
extend Group
|
16
|
-
self.type = :computer
|
17
|
-
self.targets = [:entity]
|
18
|
-
self.default = :yaml
|
19
|
-
end
|
20
|
-
# Unserializers recreate entities from a serialized format.
|
21
|
-
module Unserializers
|
22
|
-
extend Group
|
23
|
-
self.type = :transformer
|
24
|
-
self.targets = [:collection, :document]
|
25
|
-
self.default = :autoselect
|
26
|
-
end
|
27
|
-
# Visualizers transform entities into a visualizable format.
|
28
|
-
module Visualizers
|
29
|
-
extend Group
|
30
|
-
self.type = :computer
|
31
|
-
self.targets = [:entity]
|
32
|
-
self.default = :tree
|
33
|
-
end
|
34
|
-
extend Treat::Category
|
1
|
+
# The Formatters category: worker groups that move
# entities to and from external representations
# (files on disk, serialized dumps, visualizations).
module Treat::Formatters

  # Group of workers that read the content of
  # a document. Targets documents only.
  module Readers
    extend Treat::Groupable
    self.type = :computer
    self.targets = [:document]
  end

  # Group of workers that rebuild entities out
  # of a previously serialized representation.
  module Unserializers
    extend Treat::Groupable
    self.type = :computer
    self.targets = [:entity]
  end

  # Group of workers that turn entities into a
  # format suitable for storage (YAML by default).
  module Serializers
    extend Treat::Groupable
    self.type = :computer
    self.targets = [:entity]
    self.default = :yaml
  end

  # Group of workers that turn entities into a
  # format meant to be looked at (tree by default).
  module Visualizers
    extend Treat::Groupable
    self.type = :computer
    self.targets = [:entity]
    self.default = :tree
  end

  # Extended last, after every group above
  # has been defined.
  extend Treat::Categorizable

end
|
37
|
-
|
@@ -1,33 +1,53 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
1
|
+
# A reader for AbiWord (.abw) files, wrapping a small
# utility written by Mark Watson.
# Released under the GPL.
#
# Original project website:
# http://www.markwatson.com/opensource/
#
# Todo: reimplement with Nokogiri and use
# XML node information to better translate
# the format of the text.
class Treat::Formatters::Readers::ABW

  silence_warnings do
    require 'rexml/document'
    require 'rexml/streamlistener'
  end

  # Extract the readable text from an AbiWord file.
  #
  # Stream-parses the file found at document.file,
  # stores the collected text in document.value, sets
  # the document's :format to :abw_word and returns
  # the document.
  #
  # Options: none.
  def self.read(document, options = {})

    xml_h = ABWXmlHandler.new
    # File.read is used rather than IO.read: IO.read
    # treats a path starting with "|" as a command to
    # run, which must never happen for a file name.
    REXML::Document.parse_stream(
      File.read(document.file), xml_h)

    document.value = xml_h.plain_text
    document.set :format, :abw_word
    document

  end

  # Helper class to parse the AbiWord file: a REXML
  # stream listener that accumulates the text nodes
  # it is handed into plain_text.
  class ABWXmlHandler
    include REXML::StreamListener
    attr_reader :plain_text

    def initialize
      @plain_text = ""
    end

    # Called by REXML for each text node.
    def text(s)
      # Skip the two metadata values that would
      # otherwise leak into the extracted text.
      return if s == 'AbiWord' ||
        s == 'application/x-abiword'
      # Non-destructive strip: the string handed in
      # by the parser is left untouched.
      chunk = s.strip
      return if chunk.empty?
      chunk += ' '
      # Short chunks are followed by a blank line
      # (presumably treated as titles or one-line
      # paragraphs -- TODO confirm the 45 threshold).
      chunk += "\n\n" if chunk.length < 45
      @plain_text << chunk
    end
  end

end
|
@@ -1,35 +1,39 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
# - :ocr_engine => :ocropus or :gocr (the OCR engine to use).
|
17
|
-
def self.read(document, options)
|
18
|
-
ext = document.file.split('.')[-1]
|
19
|
-
reader = ImageExtensions.include?(ext) ? 'image' : ext
|
20
|
-
reader = 'html' if reader == 'htm'
|
21
|
-
reader = 'yaml' if reader == 'yml'
|
22
|
-
begin
|
23
|
-
r = Treat::Formatters::Readers.const_get(cc(reader))
|
24
|
-
rescue NameError
|
25
|
-
raise Treat::Exception,
|
26
|
-
"Cannot find a reader for format: '#{ext}'."
|
27
|
-
end
|
28
|
-
document = r.read(document, options)
|
29
|
-
document.set :encoding, document.to_s.encoding.to_s.downcase
|
30
|
-
document
|
31
|
-
end
|
32
|
-
end
|
33
|
-
end
|
1
|
+
# A reader that picks the format to read a document
# with by looking at the extension of its file name.
class Treat::Formatters::Readers::Autoselect

  # Captures a trailing alphanumeric extension
  # of 2 to 5 characters.
  ExtensionRegexp = /^.*?\.([a-zA-Z0-9]{2,5})$/
  # Extensions (lower case) mapped to the :image format.
  ImageExtensions = ['gif', 'jpg', 'jpeg', 'png']
  DefaultOptions = {
    :default_to => :txt
  }

  # Choose a reader to use, then call document.read
  # with the detected format and return its result.
  #
  # Options:
  # - (Symbol) :default_to => format to default to
  #   when no extension can be detected.
  def self.read(document, options = {})
    options = DefaultOptions.merge(options)
    document.read(detect_format(document.file, options[:default_to]))
  end

  # Return, as a Symbol, the format matching the
  # extension of filename, or default_to when the
  # name carries no recognizable extension.
  #
  # The extension is compared case-insensitively, so
  # that 'scan.JPG' resolves to :image and 'page.HTM'
  # to :html rather than to :JPG / :HTM.
  def self.detect_format(filename, default_to = DefaultOptions[:default_to])

    match = ExtensionRegexp.match(filename)
    ext = match ? match[1].downcase : ''

    return default_to.to_sym if ext.empty?
    return :image if ImageExtensions.include?(ext)

    # Humanize extensions.
    case ext
    when 'htm' then :html
    when 'yml' then :yaml
    else ext.to_sym
    end

  end

end
|
@@ -1,15 +1,21 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
1
|
+
# A wrapper for the 'antiword' command-line utility.
class Treat::Formatters::Readers::DOC

  # Extract the readable text from a DOC file
  # using the antiword command-line utility.
  #
  # The output of antiword is stored in document.value
  # with single line breaks turned into spaces and
  # blank-line paragraph breaks preserved; the
  # document's :format is set to :doc and the
  # document is returned.
  #
  # Options: none.
  def self.read(document, options = {})

    # The command is given as an argument array so that
    # no shell is involved: file names containing spaces
    # or shell metacharacters are passed through verbatim.
    text = IO.popen(['antiword', document.file]) { |io| io.read }

    # Within each paragraph (delimited by "\n\n"), join
    # the hard-wrapped lines with spaces. Splitting and
    # re-joining avoids a placeholder token that could
    # collide with the document's own text.
    paragraphs = text.split("\n\n", -1)
    text = paragraphs.map { |para| para.tr("\n", ' ') }.join("\n\n")

    document.value = text
    document.set :format, :doc
    document

  end

end
|
@@ -1,33 +1,55 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
1
|
+
# This class is a wrapper for the 'ruby-readability'
# gem, which extracts the primary readable content
# of a web page by using a set of handwritten rules.
#
# Project homepage:
# https://github.com/iterationlabs/ruby-readability
class Treat::Formatters::Readers::HTML

  silence_warnings { require 'ruby-readability' }

  # By default, don't backup the original HTML, and
  # keep a wider set of tags than a bare div/p list.
  DefaultOptions = {
    :keep_html => false,
    :tags => %w[p div h1 h2 h3 ul ol dl dt li]
  }

  # Read the HTML document and strip it of its markup.
  #
  # HTML comments are removed from the file's content,
  # which is then handed to Readability::Document. The
  # document's value becomes the extracted title wrapped
  # in an <h1> followed by the extracted content; its
  # :format is set to :html and the document is returned.
  #
  # Options (merged over DefaultOptions and forwarded
  # as-is to Readability::Document.new):
  #
  # - (Boolean) :keep_html => presumably whether to backup
  # the original HTML text when cleaning the document
  # (default: false). NOTE(review): this method itself
  # never reads the option -- confirm where it is used.
  # - (Boolean) :remove_empty_nodes => remove <p> tags
  # that have no text content; also removes p tags
  # that contain only images.
  # - (String) :encoding => if the page is of a known
  # encoding, you can specify it; if left unspecified,
  # the encoding will be guessed (only in Ruby 1.9.x)
  # - (String) :html_headers => in Ruby 1.9.x these will
  # be passed to the guess_html_encoding gem to aid with
  # guessing the HTML encoding.
  # - (Array of String) :tags => the base whitelist of
  # tags to sanitize, defaults to
  # %w[p div h1 h2 h3 ul ol dl dt li].
  # - (Array of String) :attributes => list allowed attributes
  # - (Array of String) :ignore_image_format => for use with images.
  # - (Numeric) :min_image_height => minimum image height for images.
  # - (Numeric) :min_image_width => minimum image width for images.
  def self.read(document, options = {})

    # Todo: set encoding with the guess_html_encoding gem.
    options = DefaultOptions.merge(options)
    html = File.read(document.file)

    silence_warnings do
      # Strip comments. The lazy, multi-line match stops at
      # the first "-->", so comments that contain ">" (for
      # instance commented-out markup) are removed as well.
      html.gsub!(/<!--.*?-->/m, '')
      d = Readability::Document.new(html, options)
      document.value = "<h1>#{d.title}</h1>\n" + d.content
      document.set :format, :html
    end

    document

  end

end
|
@@ -1,43 +1,44 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
end
|
34
|
-
end
|
35
|
-
options[:silent] ?
|
36
|
-
silence_stdout { read.call(document) } :
|
37
|
-
read.call(document)
|
38
|
-
document
|
39
|
-
end
|
1
|
+
# This class is a wrapper for the Google Ocropus
# optical character recognition (OCR) engine.
#
# "OCRopus(tm) is a state-of-the-art document
# analysis and OCR system, featuring pluggable
# layout analysis, pluggable character recognition,
# statistical natural language modeling, and multi-
# lingual capabilities."
#
# Original paper:
#
# Breuel, Thomas M. The Ocropus Open Source OCR System.
# DFKI and U. Kaiserslautern, Germany.
class Treat::Formatters::Readers::Image

  # Used to quote the paths interpolated in shell commands.
  require 'shellwords'

  # Read a file using the Google Ocropus reader.
  #
  # Runs the ocropus pipeline on the document's file inside
  # a temporary directory, reads the HTML it produces through
  # the :html reader, then points the document back at its
  # original file and sets its :format to :image. Returns
  # the document.
  #
  # Options:
  #
  # - (Boolean) :silent => whether to silence Ocropus.
  def self.read(document, options = {})

    read = lambda do |doc|
      # Remember the image's path: :file is temporarily
      # redirected to the generated HTML below and has
      # to be restored afterwards.
      original_file = doc.file
      create_temp_dir do |tmp|
        out = Shellwords.escape("#{tmp}/out")
        html_file = "#{tmp}/output.html"
        `ocropus book2pages #{out} #{Shellwords.escape(original_file)}`
        `ocropus pages2lines #{out}`
        `ocropus lines2fsts #{out}`
        `ocropus buildhtml #{out} > #{Shellwords.escape(html_file)}`
        doc.set :file, html_file
        doc = doc.read(:html)
        doc.set :file, original_file
        doc.set :format, :image
      end
    end

    if options[:silent]
      silence_stdout { read.call(document) }
    else
      read.call(document)
    end

    document

  end

end
|
@@ -1,50 +1,64 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
1
|
+
# A reader for the ODT (Open Office)
# document format.
#
# Based on work by Mark Watson,
# licensed under the GPL.
#
# Original project website:
# http://www.markwatson.com/opensource/
#
# Todo: reimplement with Nokogiri and use
# XML node information to better translate
# the format of the text.
class Treat::Formatters::Readers::ODT

  # Require the 'zip' gem to unarchive the ODT files
  silence_warnings { require 'zip' }

  # Extract the readable text from an ODT file.
  #
  # Reads 'content.xml' out of the archive found at
  # document.file, stream-parses it, stores the collected
  # text in document.value, sets the document's :format
  # to :odt_office and returns the document.
  #
  # Raises a RuntimeError if no content could be read
  # from the archive.
  #
  # Options: none.
  def self.read(document, options = {})
    f = nil
    # Opened without the CREATE flag: reading must never
    # bring a new archive into existence when the path
    # does not point to an existing file.
    Zip::ZipFile.open(document.file) do |zipfile|
      f = zipfile.read('content.xml')
    end
    raise "Couldn't unzip ODT file " +
    "#{document.file}!" unless f
    xml_h = ODTXmlHandler.new
    REXML::Document.parse_stream(f, xml_h)

    document.value = xml_h.plain_text
    document.set :format, :odt_office
    document

  end

  # Xml listener for the parsing of the ODT file.
  class ODTXmlHandler
    silence_warnings do
      require 'rexml/document'
      require 'rexml/streamlistener'
    end
    include REXML::StreamListener
    attr_reader :plain_text

    def initialize
      @plain_text = ""
      @last_name = ""
    end

    # Remember the name of the most recently opened tag,
    # so that text() can tell which element it belongs to.
    def tag_start(name, attrs)
      @last_name = name
    end

    # Collect a text node, followed by a blank line, when
    # the last opened tag's name contains 'text'; blank
    # nodes are skipped.
    def text(s)
      if @last_name.index('text')
        s = s.strip
        if s.length > 0
          @plain_text << s
          @plain_text << "\n\n"
        end
      end
    end
  end

end
|