treat 0.1.2 → 0.1.3
- data/LICENSE +7 -8
- data/TODO +16 -13
- data/examples/keywords.rb +89 -1
- data/lib/treat/buildable.rb +1 -8
- data/lib/treat/categories.rb +3 -4
- data/lib/treat/category.rb +1 -1
- data/lib/treat/delegatable.rb +1 -1
- data/lib/treat/detectors/encoding/native.rb +5 -0
- data/lib/treat/detectors/encoding/r_chardet19.rb +2 -3
- data/lib/treat/detectors/language/language_detector.rb +4 -0
- data/lib/treat/detectors/language/what_language.rb +4 -4
- data/lib/treat/detectors.rb +1 -1
- data/lib/treat/entities/entity.rb +5 -3
- data/lib/treat/entities/tokens.rb +14 -5
- data/lib/treat/entities/zones.rb +4 -0
- data/lib/treat/entities.rb +7 -5
- data/lib/treat/extractors/keywords/topics_frequency.rb +40 -0
- data/lib/treat/extractors/statistics/{frequency.rb → frequency_in.rb} +5 -4
- data/lib/treat/extractors/statistics/frequency_of.rb +3 -5
- data/lib/treat/extractors/statistics/{position_in.rb → position_in_parent.rb} +4 -3
- data/lib/treat/extractors/statistics/tf_idf.rb +36 -0
- data/lib/treat/extractors/statistics/transition_matrix.rb +25 -25
- data/lib/treat/extractors/statistics/transition_probability.rb +8 -4
- data/lib/treat/extractors/time/chronic.rb +8 -0
- data/lib/treat/extractors/time/native.rb +6 -0
- data/lib/treat/extractors/time/nickel.rb +31 -23
- data/lib/treat/extractors/topic_words/lda.rb +21 -16
- data/lib/treat/extractors/topics/reuters.rb +6 -4
- data/lib/treat/extractors.rb +7 -7
- data/lib/treat/formatters/readers/abw.rb +32 -0
- data/lib/treat/formatters/readers/autoselect.rb +13 -11
- data/lib/treat/formatters/readers/doc.rb +13 -0
- data/lib/treat/formatters/readers/gocr.rb +2 -0
- data/lib/treat/formatters/readers/html.rb +21 -1
- data/lib/treat/formatters/readers/ocropus.rb +3 -3
- data/lib/treat/formatters/readers/odt.rb +41 -0
- data/lib/treat/formatters/readers/pdf.rb +5 -2
- data/lib/treat/formatters/readers/txt.rb +2 -0
- data/lib/treat/formatters/serializers/xml.rb +3 -2
- data/lib/treat/formatters/serializers/yaml.rb +2 -0
- data/lib/treat/formatters/unserializers/autoselect.rb +7 -1
- data/lib/treat/formatters/unserializers/xml.rb +6 -1
- data/lib/treat/formatters/unserializers/yaml.rb +5 -1
- data/lib/treat/formatters/visualizers/dot.rb +35 -37
- data/lib/treat/formatters/visualizers/html.rb +1 -0
- data/lib/treat/formatters/visualizers/inspect.rb +4 -0
- data/lib/treat/formatters/visualizers/short_value.rb +18 -3
- data/lib/treat/formatters/visualizers/standoff.rb +11 -6
- data/lib/treat/formatters/visualizers/tree.rb +5 -1
- data/lib/treat/formatters/visualizers/txt.rb +6 -1
- data/lib/treat/formatters.rb +1 -1
- data/lib/treat/group.rb +4 -3
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +14 -17
- data/lib/treat/inflectors/conjugations/linguistics.rb +16 -3
- data/lib/treat/inflectors/declensions/linguistics.rb +17 -6
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +9 -10
- data/lib/treat/inflectors/stem/porter.rb +6 -2
- data/lib/treat/inflectors/stem/porter_c.rb +4 -1
- data/lib/treat/inflectors/stem/uea.rb +4 -4
- data/lib/treat/languages/english/tags.rb +16 -0
- data/lib/treat/languages/english.rb +4 -1
- data/lib/treat/lexicalizers/category/from_tag.rb +4 -4
- data/lib/treat/lexicalizers/linkages/naive.rb +3 -3
- data/lib/treat/lexicalizers/tag/brill.rb +3 -11
- data/lib/treat/lexicalizers/tag/lingua.rb +4 -6
- data/lib/treat/lexicalizers.rb +0 -2
- data/lib/treat/processors/chunkers/txt.rb +4 -4
- data/lib/treat/processors/parsers/enju.rb +3 -17
- data/lib/treat/processors/parsers/stanford.rb +4 -0
- data/lib/treat/processors/segmenters/punkt.rb +1 -0
- data/lib/treat/processors/segmenters/stanford.rb +4 -0
- data/lib/treat/processors/segmenters/tactful.rb +4 -1
- data/lib/treat/processors/tokenizers/punkt.rb +1 -2
- data/lib/treat/processors/tokenizers/stanford.rb +4 -0
- data/lib/treat/processors/tokenizers/tactful.rb +1 -1
- data/lib/treat/processors.rb +4 -4
- data/lib/treat/proxies.rb +18 -11
- data/lib/treat/registrable.rb +12 -5
- data/lib/treat/sugar.rb +8 -3
- data/lib/treat/tree.rb +10 -3
- data/lib/treat.rb +55 -55
- data/test/tc_entity.rb +7 -7
- data/test/tc_extractors.rb +6 -4
- data/test/tc_formatters.rb +0 -4
- data/test/tests.rb +2 -0
- data/test/texts.rb +4 -4
- metadata +48 -56
- data/examples/texts/bugged_out.txt +0 -26
- data/examples/texts/half_cocked_basel.txt +0 -16
- data/examples/texts/hedge_funds.txt +0 -24
- data/examples/texts/hose_and_dry.txt +0 -19
- data/examples/texts/hungarys_troubles.txt +0 -46
- data/examples/texts/indias_slowdown.txt +0 -15
- data/examples/texts/merkozy_rides_again.txt +0 -24
- data/examples/texts/prada_is_not_walmart.txt +0 -9
- data/examples/texts/republican_nomination.txt +0 -26
- data/examples/texts/to_infinity_and_beyond.txt +0 -15
- data/lib/treat/entities/text.rb +0 -7
- data/lib/treat/extractors/key_sentences/topics_frequency.rb +0 -49
- data/lib/treat/formatters/cleaners/html.rb +0 -17
data/lib/treat/extractors/statistics/transition_probability.rb
CHANGED
@@ -1,6 +1,8 @@
 module Treat
   module Extractors
     module Statistics
+      # Experimental algorithm to calculate the transition
+      # probability of an observed word.
       class TransitionProbability
 
         # Find the transition probability.
@@ -18,14 +20,16 @@ module Treat
           next unless tm[f1][v1]
 
           relationships.each do |relationship|
-            relatives =
+            relatives = entity.send(relationship)
             relatives = [relatives] unless relatives.is_a? Array
             relatives.each do |relative|
               next if relative.nil? || !relative.has?(f2)
               v2 = relative.send(f2)
-              if tm[f1][v1][relationship]
-
-
+              if tm[f1][v1][relationship] &&
+                 tm[f1][v1][relationship][f2] &&
+                 tm[f1][v1][relationship][f2][v2]
+                score += tm[f1][v1][relationship][f2][v2]
+                count += 1
              end
            end
          end
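Note: the rewritten condition above guards every level of the nested transition-matrix hash before indexing into it. A minimal Ruby sketch (not from the gem) of why the chained check is needed:

  tm = { :tag => { 'NN' => {} } }          # hypothetical nested transition matrix
  tm[:tag]['NN'][:parent]                  # => nil, the relationship was never observed
  # tm[:tag]['NN'][:parent][:tag]          # would raise NoMethodError on nil
  if tm[:tag]['NN'][:parent] &&
     tm[:tag]['NN'][:parent][:tag]         # the && chain stops at the first nil
    # safe to accumulate the score, as the new code does
  end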
data/lib/treat/extractors/time/chronic.rb
CHANGED
@@ -1,8 +1,16 @@
 module Treat
   module Extractors
     module Time
+      # A wrapper for the 'chronic' gem, which parses
+      # time and date information.
+      #
+      # Project website: http://chronic.rubyforge.org/
       class Chronic
         silence_warnings { require 'chronic' }
+        # Return the time information contained within the entity
+        # by parsing it with the 'chronic' gem.
+        #
+        # Options: none.
         def self.time(entity, options = {})
           silence_warnings { ::Chronic.parse(entity.to_s, {:guess => true}) }
         end
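Note: a minimal usage sketch for the new Chronic wrapper (not part of the diff); the entity construction via Treat::Entities::Entity.from_string mirrors what the readers elsewhere in this release do and is an assumption here:

  require 'treat'
  entity = Treat::Entities::Entity.from_string('lunch with Megan tomorrow at noon')
  Treat::Extractors::Time::Chronic.time(entity)
  # => the Time guessed by ::Chronic.parse(entity.to_s, :guess => true)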
data/lib/treat/extractors/time/native.rb
CHANGED
@@ -1,8 +1,14 @@
 module Treat
   module Extractors
     module Time
+      # A wrapper for Ruby's native date/time parsing.
       module Native
         require 'date'
+        # Return a DateTime object representing the date/time
+        # contained within the entity, using Ruby's native
+        # date/time parser.
+        #
+        # Options: none.
         def self.time(entity, options = {})
           ::DateTime.parse(entity.to_s)
         end
data/lib/treat/extractors/time/nickel.rb
CHANGED
@@ -1,45 +1,53 @@
 module Treat
   module Extractors
     module Time
-
-
-
-
-
-
-
-
-
-
-
-
+      # A wrapper for the 'nickel' gem, which parses
+      # times and dates and supplies additional information
+      # concerning these. The additional information supplied
+      # that this class annotates entities with is:
+      #
+      # - time_recurrence: frequency of recurrence in words*.
+      # - time_recurrence_interval: frequency of recurrence in days.
+      # - start_time: a DateTime object representing the beginning of
+      # an event.
+      # - end_time: a DateTime object representing the end of an event.
+      #
+      # Examples of values for time_recurrence are:
+      #
+      # - single: "lunch with megan tomorrow at noon"
+      # - daily: "Art exhibit until March 1st"
+      # - weekly: "math class every wed from 8-11am"
+      # - daymonthly: "open bar at joes the first friday of every month"
+      # - datemonthly: "pay credit card bill on the 22nd of each month"
+      #
+      # Project website: http://naturalinputs.com/
       module Nickel
         require 'date'
         silence_warnings { require 'nickel' }
+        # Extract time information from a bit of text.
        def self.time(entity, options = {})
          n = silence_warnings { ::Nickel.parse(entity.to_s) }
          occ = n.occurrences[0]
-
+
          rec = occ.type.to_s.gsub('single', 'once').intern
          entity.set :time_recurrence, rec
-          interval = occ.interval ? occ.interval
+          interval = occ.interval ? occ.interval : :none
          entity.set :time_recurrence_interval, interval
-
+
          s = [occ.start_date, occ.start_time]
          ds = [s[0].year, s[0].month, s[0].day] if s[0]
-          ts = [s[1].hour, s[1].min, s[1].sec] if s[1]
-
+          #ts = [s[1].hour, s[1].min, s[1].sec] if s[1]
+
          e = [occ.end_date, occ.end_time]
          de = [e[0].year, e[0].month, e[0].day] if e[0]
-          te = [e[1].hour, e[1].min, e[1].sec] if e[1]
-
-          entity.set :start_time, ::DateTime.civil(*ds
-          entity.set :end_time, ::DateTime.civil(*de
-
+          #te = [e[1].hour, e[1].min, e[1].sec] if e[1]
+
+          entity.set :start_time, ::DateTime.civil(*ds) if ds
+          entity.set :end_time, ::DateTime.civil(*de) if de
+
          entity.start_time
        end
      end
    end
  end
end
-
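Note: a hypothetical sketch of the annotations the Nickel wrapper sets; the feature names come from the comments above, but the entity setup and the way features are read back are assumptions, not part of the diff:

  entity = Treat::Entities::Entity.from_string('math class every wed from 8-11am')
  Treat::Extractors::Time::Nickel.time(entity)   # returns entity.start_time
  # The entity is annotated via entity.set with:
  #   :time_recurrence          - e.g. :weekly (:single is rewritten to :once)
  #   :time_recurrence_interval - Nickel's interval, or :none when absent
  #   :start_time / :end_time   - DateTime objects, set only when a date was found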
data/lib/treat/extractors/topic_words/lda.rb
CHANGED
@@ -9,6 +9,8 @@ module Treat
       # Blei, David M., Ng, Andrew Y., and Jordan, Michael
       # I. 2003. Latent dirichlet allocation. Journal of
       # Machine Learning Research. 3 (Mar. 2003), 993-1022.
+      #
+      # Project website: https://github.com/ealdent/lda-ruby
       class LDA
         # Require the lda-ruby gem.
         silence_warnings { require 'lda-ruby' }
@@ -17,25 +19,28 @@ module Treat
         Lda::TextCorpus.class_eval do
           # Ruby, Y U NO SHUT UP!
           silence_warnings { undef :initialize }
-          # Redefine initialize to take in an array of
-          def initialize(
+          # Redefine initialize to take in an array of sections
+          def initialize(sections)
             super(nil)
-
-            add_document(Lda::TextDocument.new(self,
+            sections.each do |section|
+              add_document(Lda::TextDocument.new(self, section))
             end
           end
         end
+        # Default options for the LDA algorithm.
+        DefaultOptions = {
+          topics: 20,
+          words_per_topic: 10,
+          iterations: 20
+        }
+        # Retrieve the topic words of a collection.
         def self.topic_words(collection, options = {})
-
-          options[:words_per_topic] ||= 10
-          options[:topics] ||= 20
-          options[:iterations] ||= 20
-
+          options = DefaultOptions.merge(options)
          # Create a corpus with the collection
-
+          sections = collection.sections.collect do |t|
            t.to_s.encode("UTF-8", :invalid => :replace, :undef => :replace, :replace => "?")
          end
-          corpus = Lda::TextCorpus.new(
+          corpus = Lda::TextCorpus.new(sections)
 
          # Create an Lda object for training
          lda = Lda::Lda.new(corpus)
@@ -43,15 +48,15 @@ module Treat
         lda.max_iter = options[:iterations]
         # Run the EM algorithm using random starting points
         silence_streams(STDOUT, STDERR) { lda.em('random') }
-
+
         # Load the vocabulary.
         if options[:vocabulary]
           lda.load_vocabulary(options[:vocabulary])
         end
-
-        # Get the topic words and annotate the
+
+        # Get the topic words and annotate the section.
         topic_words = lda.top_words(options[:words_per_topic])
-
+
         topic_words.each do |i, words|
           collection.each_word do |word|
             if words.include?(word)
@@ -62,7 +67,7 @@ module Treat
           end
         end
       end
-
+
       topic_words
     end
   end
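Note: a hedged usage sketch for the reworked topic_words extractor; the option keys come from DefaultOptions above, while the collection is assumed to have been built elsewhere with Treat:

  # `collection` is a Treat collection with sections (illustrative only).
  topic_words = Treat::Extractors::TopicWords::LDA.topic_words(
    collection, topics: 5, words_per_topic: 3, iterations: 50
  )
  # lda.top_words returns topic index => word list, as the loop above iterates.
  topic_words.each { |i, words| puts "Topic #{i}: #{words.inspect}" }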
data/lib/treat/extractors/topics/reuters.rb
CHANGED
@@ -6,9 +6,9 @@ module Treat
       #
       # Copyright 2005 Mark Watson. All rights reserved.
       # This software is released under the GPL.
-      #
-      #
-      #
+      # Rewrite for inclusion in Treat by Louis Mullie (2011).
+      #
+      # Original project website: http://www.markwatson.com/opensource/
       class Reuters
         # Require the Nokogiri XML parser.
         require 'nokogiri'
@@ -17,6 +17,8 @@ module Treat
       @@region = {}
       @@topics = {}
       # Get the topic of the text.
+      #
+      # Options: none.
       def self.topics(text, options = {})
         stems = []
         @@reduce = 0
@@ -33,7 +35,7 @@ module Treat
         topics = score_words(@@industry, stems)
         topics = topics.merge(score_words(@@region, stems))
         topics = topics.merge(score_words(@@topics, stems))
-        Treat::Feature.new(topics)
+        #Treat::Feature.new(topics)
       end
       # Read the topics from the XML files.
       def self.get_topics
data/lib/treat/extractors.rb
CHANGED
@@ -6,19 +6,19 @@ module Treat
     module Time
       extend Group
       self.type = :annotator
-      self.targets = [:word, :constituent, :symbol]
+      self.targets = [:sentence, :word, :constituent, :symbol]
     end
     # Extract the topic from a text.
     module Topics
       extend Group
       self.type = :annotator
-      self.targets = [:collection, :document, :
+      self.targets = [:collection, :document, :zone, :sentence]
     end
     # Extract the topic from a text.
     module TopicWords
       extend Group
       self.type = :annotator
-      self.targets = [:collection, :document, :
+      self.targets = [:collection, :document, :zone, :sentence]
     end
     # Extract named entities from texts.
     module NamedEntity
@@ -27,15 +27,15 @@ module Treat
       self.targets = [:entity]
     end
     # Extract the key sentences from a text.
-    module
+    module Keywords
       extend Group
-      self.type = :
-      self.targets = [:collection, :document, :
+      self.type = :annotator
+      self.targets = [:collection, :document, :zone, :sentence]
     end
     # This module should be moved out of here ASAP.
     module Statistics
       extend Group
-      self.type = :
+      self.type = :annotator
       self.targets = [:entity]
       self.default = :none
     end
data/lib/treat/formatters/readers/abw.rb
@@ -0,0 +1,32 @@
+module Treat
+  module Formatters
+    module Readers
+      class Abw
+        require 'rexml/document'
+        require 'rexml/streamlistener'
+        def self.read(document, options = {})
+          xml_h = AbiWordXmlHandler.new(
+          REXML::Document.parse_stream((IO.read(document.file)), xml_h))
+          document << xml_h.plain_text
+          document
+        end
+        class AbiWordXmlHandler
+          include REXML::StreamListener
+          attr_reader :plain_text
+          def initialize
+            @plain_text = ""
+          end
+          def text s
+            begin
+              s = s.strip
+              if s.length > 0
+                @plain_text << s
+                @plain_text << "\n"
+              end
+            end if s != 'AbiWord' && s != 'application/x-abiword'
+          end
+        end
+      end
+    end
+  end
+end
data/lib/treat/formatters/readers/autoselect.rb
CHANGED
@@ -6,26 +6,28 @@ module Treat
     # the appropriate reader based on the file
     # extension of the supplied document.
     class Autoselect
-      # A list of image extensions that should be routed
-      # to the Ocropus OCR engine.
+      # A list of image extensions that should be routed to OCR.
       ImageExtensions = ['gif', 'jpg', 'jpeg', 'png']
+      # Default options.
+      DefaultOptions = {:ocr => :ocropus}
       # Select the appropriate reader based on the format
       # of the filename in document.
       #
       # Options:
-      #
-
+      #
+      # - :ocr_engine => :ocropus or :gocr (the OCR engine to use).
+      def self.read(document, options)
+        options = DefaultOptions.merge(options)
         ext = document.file.split('.')[-1]
-
-
-
-        reader = ext
-      end
+        reader = ImageExtensions.include?(ext) ? 'ocropus' : ext
+        reader = 'html' if reader == 'htm'
+        reader = 'yaml' if reader == 'yml'
         begin
           r = Treat::Formatters::Readers.const_get(cc(reader))
-        rescue NameError
+        rescue NameError => e
+          puts e.message
           raise Treat::Exception,
-            "Cannot find a
+            "Cannot find a reader for format: '#{ext}'."
         end
         document = r.read(document, options)
       end
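Note: an illustrative sketch (not from the diff) of how the reworked Autoselect routes files, following the mapping in the hunk above; the document object is assumed to wrap a file path as in the other readers:

  # 'scan.png'  -> routed to the 'ocropus' reader (via ImageExtensions)
  # 'page.htm'  -> routed to the 'html' reader
  # 'notes.yml' -> routed to the 'yaml' reader
  # Unknown formats raise Treat::Exception ("Cannot find a reader for format ...").
  document = Treat::Formatters::Readers::Autoselect.read(document, :ocr => :ocropus)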
data/lib/treat/formatters/readers/html.rb
CHANGED
@@ -1,11 +1,31 @@
 module Treat
   module Formatters
     module Readers
+      # A temporary HTML reader; simply strips the
+      # document of all of its markup.
       class HTML
+        # Require Hpricot.
+        silence_warnings { require 'hpricot' }
+        # By default, backup the HTML text while cleaning.
+        DefaultOptions = { clean: true, backup: false }
+        # Read the HTML document and strip it of its markup.
+        #
+        # Options:
+        #
+        # - (Boolean) :clean => whether to strip HTML markup.
+        # - (Boolean) :backup => whether to backup the HTML
+        # markup while cleaning.
         def self.read(document, options = {})
+          options = DefaultOptions.merge(options)
           f = File.read(document.file)
           document << Treat::Entities::Entity.from_string(f)
-
+          if options[:clean]
+            document.each do |section|
+              section.set :html_value, section.value if options[:backup]
+              section.value = Hpricot(section.value).inner_text
+            end
+          end
+          document
         end
       end
     end
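Note: a minimal sketch of the new HTML reader options (the document setup is assumed; :clean and :backup are the keys introduced above):

  # With :backup => true, each section keeps its original markup in the
  # :html_value feature before being replaced by Hpricot's inner_text.
  document = Treat::Formatters::Readers::HTML.read(document, :clean => true, :backup => true)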
data/lib/treat/formatters/readers/ocropus.rb
CHANGED
@@ -15,11 +15,11 @@ module Treat
       # DFKI and U. Kaiserslautern, Germany.
       class Ocropus
         # Read a file using the Google Ocropus reader.
+        #
+        # Options: none.
         def self.read(document, options = {})
           create_temp_file(:txt) do |tmp|
-
-            `ocropus page #{document.file} > #{tmp} -STDIO 2>/dev/null`
-          end
+            `ocropus page #{document.file} > #{tmp} -STDIO 2>/dev/null`
           f = File.read(tmp)
           document << Treat::Entities::Entity.from_string(f)
         end
data/lib/treat/formatters/readers/odt.rb
@@ -0,0 +1,41 @@
+module Treat
+  module Formatters
+    module Readers
+      class Odt
+        # Build an entity from a string in plain text format.
+        def self.read(document, options = {})
+          f = File.read(document.file)
+          f = f.force_encoding("UTF-8")
+          xml_h = OOXmlHandler.new(
+          REXML::Document.parse_stream(f, xml_h)
+          )
+          document << xml_h.plain_text
+          document
+        end
+
+        class OOXmlHandler
+          require 'rexml/document'
+          require 'rexml/streamlistener'
+          include REXML::StreamListener
+          attr_reader :plain_text
+          def initialize
+            @plain_text = ""
+          end
+          def tag_start(name, attrs)
+            @last_name = name
+          end
+          def text(s)
+            if @last_name.index('text')
+              s = s.strip
+              if s.length > 0
+                @plain_text << s
+                @plain_text << "\n"
+              end
+            end
+          end
+        end
+      end
+
+    end
+  end
+end
data/lib/treat/formatters/readers/pdf.rb
CHANGED
@@ -1,9 +1,12 @@
 module Treat
   module Formatters
     module Readers
+      # A wrapper for the Poppler pdf2text utility, which
+      # extracts the text from a PDF file.
       class PDF
-
-        #
+        # Read a PDF file using the Poppler pdf2text utility.
+        #
+        # Options: none.
        def self.read(document, options = {})
          create_temp_file(:txt) do |tmp|
            `pdftotext #{document.file} #{tmp} `.strip
data/lib/treat/formatters/readers/txt.rb
CHANGED
@@ -4,6 +4,8 @@ module Treat
       # This class simply reads a plain text file.
       class Txt
         # Build an entity from a string in plain text format.
+        #
+        # Options: none.
         def self.read(document, options = {})
           f = File.read(document.file)
           document << Treat::Entities::Entity.from_string(f)
data/lib/treat/formatters/serializers/xml.rb
CHANGED
@@ -1,7 +1,7 @@
 module Treat
   module Formatters
     module Serializers
-      # This class converts an entity to XML format.
+      # This class converts an entity to a storable XML format.
       class XML
         # Reauire the Nokogiri XML parser.
         require 'nokogiri'
@@ -9,7 +9,8 @@ module Treat
       def self.serialize(entity, options = {})
         options = {:indent => 0} if options[:indent].nil?
         if options[:indent] == 0
-
+          enc = entity.encoding(:r_chardet19).to_s.gsub('_', '-').upcase
+          string = "<?xml version=\"1.0\" encoding=\"#{enc}\" standalone=\"no\" ?>"
         else
           string = ''
         end
data/lib/treat/formatters/unserializers/autoselect.rb
CHANGED
@@ -1,7 +1,13 @@
 module Treat
   module Formatters
     module Unserializers
+      # This class doesn't perform any unserializing;
+      # it simply routes the document to an unserializer
+      # based on the file extension of the document.
       class Autoselect
+        # Unserialize any supported file format.
+        #
+        # Options: none.
         def self.unserialize(document, options = {})
           ext = document.file.split('.')[-1]
           if ext == 'yaml' || ext == 'yml'
@@ -9,7 +15,7 @@ module Treat
         elsif ext == 'xml'
           document.unserialize(:xml)
         else
-          raise "File #{document.file} was not recognized"+
+          raise "File #{document.file} was not recognized "+
             "as a supported serialized format."
         end
       end
data/lib/treat/formatters/unserializers/xml.rb
CHANGED
@@ -1,9 +1,13 @@
 module Treat
   module Formatters
     module Unserializers
+      # Recreates the entity tree corresponding to
+      # a serialized XML file.
       class XML
         require 'nokogiri'
-
+        # Unserialize an entity stored in XML format.
+        #
+        # Options: none.
         def self.unserialize(document, options = {})
           # Read in the XML file.
           xml = File.read(document.file)
@@ -59,6 +63,7 @@ module Treat
         current_value = xml_reader.value.strip
         if current_value && current_value != ''
           current_element.value = current_value
+          current_element.register_token(current_element)
         end
       end
 
data/lib/treat/formatters/unserializers/yaml.rb
CHANGED
@@ -1,10 +1,14 @@
 module Treat
   module Formatters
     module Unserializers
+      # This class is a wrapper for the Psych YAML
+      # parser; it unserializes YAML files.
       class YAML
         # Require the Psych YAML parser.
         require 'psych'
-        # Unserialize a YAML file
+        # Unserialize a YAML file.
+        #
+        # Options: none.
         def self.unserialize(document, options = {})
           document << ::Psych.load(File.read(document.file))
           document