treat 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +7 -8
- data/TODO +16 -13
- data/examples/keywords.rb +89 -1
- data/lib/treat/buildable.rb +1 -8
- data/lib/treat/categories.rb +3 -4
- data/lib/treat/category.rb +1 -1
- data/lib/treat/delegatable.rb +1 -1
- data/lib/treat/detectors/encoding/native.rb +5 -0
- data/lib/treat/detectors/encoding/r_chardet19.rb +2 -3
- data/lib/treat/detectors/language/language_detector.rb +4 -0
- data/lib/treat/detectors/language/what_language.rb +4 -4
- data/lib/treat/detectors.rb +1 -1
- data/lib/treat/entities/entity.rb +5 -3
- data/lib/treat/entities/tokens.rb +14 -5
- data/lib/treat/entities/zones.rb +4 -0
- data/lib/treat/entities.rb +7 -5
- data/lib/treat/extractors/keywords/topics_frequency.rb +40 -0
- data/lib/treat/extractors/statistics/{frequency.rb → frequency_in.rb} +5 -4
- data/lib/treat/extractors/statistics/frequency_of.rb +3 -5
- data/lib/treat/extractors/statistics/{position_in.rb → position_in_parent.rb} +4 -3
- data/lib/treat/extractors/statistics/tf_idf.rb +36 -0
- data/lib/treat/extractors/statistics/transition_matrix.rb +25 -25
- data/lib/treat/extractors/statistics/transition_probability.rb +8 -4
- data/lib/treat/extractors/time/chronic.rb +8 -0
- data/lib/treat/extractors/time/native.rb +6 -0
- data/lib/treat/extractors/time/nickel.rb +31 -23
- data/lib/treat/extractors/topic_words/lda.rb +21 -16
- data/lib/treat/extractors/topics/reuters.rb +6 -4
- data/lib/treat/extractors.rb +7 -7
- data/lib/treat/formatters/readers/abw.rb +32 -0
- data/lib/treat/formatters/readers/autoselect.rb +13 -11
- data/lib/treat/formatters/readers/doc.rb +13 -0
- data/lib/treat/formatters/readers/gocr.rb +2 -0
- data/lib/treat/formatters/readers/html.rb +21 -1
- data/lib/treat/formatters/readers/ocropus.rb +3 -3
- data/lib/treat/formatters/readers/odt.rb +41 -0
- data/lib/treat/formatters/readers/pdf.rb +5 -2
- data/lib/treat/formatters/readers/txt.rb +2 -0
- data/lib/treat/formatters/serializers/xml.rb +3 -2
- data/lib/treat/formatters/serializers/yaml.rb +2 -0
- data/lib/treat/formatters/unserializers/autoselect.rb +7 -1
- data/lib/treat/formatters/unserializers/xml.rb +6 -1
- data/lib/treat/formatters/unserializers/yaml.rb +5 -1
- data/lib/treat/formatters/visualizers/dot.rb +35 -37
- data/lib/treat/formatters/visualizers/html.rb +1 -0
- data/lib/treat/formatters/visualizers/inspect.rb +4 -0
- data/lib/treat/formatters/visualizers/short_value.rb +18 -3
- data/lib/treat/formatters/visualizers/standoff.rb +11 -6
- data/lib/treat/formatters/visualizers/tree.rb +5 -1
- data/lib/treat/formatters/visualizers/txt.rb +6 -1
- data/lib/treat/formatters.rb +1 -1
- data/lib/treat/group.rb +4 -3
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +14 -17
- data/lib/treat/inflectors/conjugations/linguistics.rb +16 -3
- data/lib/treat/inflectors/declensions/linguistics.rb +17 -6
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +9 -10
- data/lib/treat/inflectors/stem/porter.rb +6 -2
- data/lib/treat/inflectors/stem/porter_c.rb +4 -1
- data/lib/treat/inflectors/stem/uea.rb +4 -4
- data/lib/treat/languages/english/tags.rb +16 -0
- data/lib/treat/languages/english.rb +4 -1
- data/lib/treat/lexicalizers/category/from_tag.rb +4 -4
- data/lib/treat/lexicalizers/linkages/naive.rb +3 -3
- data/lib/treat/lexicalizers/tag/brill.rb +3 -11
- data/lib/treat/lexicalizers/tag/lingua.rb +4 -6
- data/lib/treat/lexicalizers.rb +0 -2
- data/lib/treat/processors/chunkers/txt.rb +4 -4
- data/lib/treat/processors/parsers/enju.rb +3 -17
- data/lib/treat/processors/parsers/stanford.rb +4 -0
- data/lib/treat/processors/segmenters/punkt.rb +1 -0
- data/lib/treat/processors/segmenters/stanford.rb +4 -0
- data/lib/treat/processors/segmenters/tactful.rb +4 -1
- data/lib/treat/processors/tokenizers/punkt.rb +1 -2
- data/lib/treat/processors/tokenizers/stanford.rb +4 -0
- data/lib/treat/processors/tokenizers/tactful.rb +1 -1
- data/lib/treat/processors.rb +4 -4
- data/lib/treat/proxies.rb +18 -11
- data/lib/treat/registrable.rb +12 -5
- data/lib/treat/sugar.rb +8 -3
- data/lib/treat/tree.rb +10 -3
- data/lib/treat.rb +55 -55
- data/test/tc_entity.rb +7 -7
- data/test/tc_extractors.rb +6 -4
- data/test/tc_formatters.rb +0 -4
- data/test/tests.rb +2 -0
- data/test/texts.rb +4 -4
- metadata +48 -56
- data/examples/texts/bugged_out.txt +0 -26
- data/examples/texts/half_cocked_basel.txt +0 -16
- data/examples/texts/hedge_funds.txt +0 -24
- data/examples/texts/hose_and_dry.txt +0 -19
- data/examples/texts/hungarys_troubles.txt +0 -46
- data/examples/texts/indias_slowdown.txt +0 -15
- data/examples/texts/merkozy_rides_again.txt +0 -24
- data/examples/texts/prada_is_not_walmart.txt +0 -9
- data/examples/texts/republican_nomination.txt +0 -26
- data/examples/texts/to_infinity_and_beyond.txt +0 -15
- data/lib/treat/entities/text.rb +0 -7
- data/lib/treat/extractors/key_sentences/topics_frequency.rb +0 -49
- data/lib/treat/formatters/cleaners/html.rb +0 -17
@@ -1,6 +1,8 @@
|
|
1
1
|
module Treat
|
2
2
|
module Extractors
|
3
3
|
module Statistics
|
4
|
+
# Experimental algorithm to calculate the transition
|
5
|
+
# probability of an observed word.
|
4
6
|
class TransitionProbability
|
5
7
|
|
6
8
|
# Find the transition probability.
|
@@ -18,14 +20,16 @@ module Treat
|
|
18
20
|
next unless tm[f1][v1]
|
19
21
|
|
20
22
|
relationships.each do |relationship|
|
21
|
-
relatives =
|
23
|
+
relatives = entity.send(relationship)
|
22
24
|
relatives = [relatives] unless relatives.is_a? Array
|
23
25
|
relatives.each do |relative|
|
24
26
|
next if relative.nil? || !relative.has?(f2)
|
25
27
|
v2 = relative.send(f2)
|
26
|
-
if tm[f1][v1][relationship]
|
27
|
-
|
28
|
-
|
28
|
+
if tm[f1][v1][relationship] &&
|
29
|
+
tm[f1][v1][relationship][f2] &&
|
30
|
+
tm[f1][v1][relationship][f2][v2]
|
31
|
+
score += tm[f1][v1][relationship][f2][v2]
|
32
|
+
count += 1
|
29
33
|
end
|
30
34
|
end
|
31
35
|
end
|
@@ -1,8 +1,16 @@
|
|
1
1
|
module Treat
|
2
2
|
module Extractors
|
3
3
|
module Time
|
4
|
+
# A wrapper for the 'chronic' gem, which parses
|
5
|
+
# time and date information.
|
6
|
+
#
|
7
|
+
# Project website: http://chronic.rubyforge.org/
|
4
8
|
class Chronic
|
5
9
|
silence_warnings { require 'chronic' }
|
10
|
+
# Return the time information contained within the entity
|
11
|
+
# by parsing it with the 'chronic' gem.
|
12
|
+
#
|
13
|
+
# Options: none.
|
6
14
|
def self.time(entity, options = {})
|
7
15
|
silence_warnings { ::Chronic.parse(entity.to_s, {:guess => true}) }
|
8
16
|
end
|
@@ -1,8 +1,14 @@
|
|
1
1
|
module Treat
|
2
2
|
module Extractors
|
3
3
|
module Time
|
4
|
+
# A wrapper for Ruby's native date/time parsing.
|
4
5
|
module Native
|
5
6
|
require 'date'
|
7
|
+
# Return a DateTime object representing the date/time
|
8
|
+
# contained within the entity, using Ruby's native
|
9
|
+
# date/time parser.
|
10
|
+
#
|
11
|
+
# Options: none.
|
6
12
|
def self.time(entity, options = {})
|
7
13
|
::DateTime.parse(entity.to_s)
|
8
14
|
end
|
@@ -1,45 +1,53 @@
|
|
1
1
|
module Treat
|
2
2
|
module Extractors
|
3
3
|
module Time
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
4
|
+
# A wrapper for the 'nickel' gem, which parses
|
5
|
+
# times and dates and supplies additional information
|
6
|
+
# concerning these. The additional information supplied
|
7
|
+
# that this class annotates entities with is:
|
8
|
+
#
|
9
|
+
# - time_recurrence: frequency of recurrence in words*.
|
10
|
+
# - time_recurrence_interval: frequency of recurrence in days.
|
11
|
+
# - start_time: a DateTime object representing the beginning of
|
12
|
+
# an event.
|
13
|
+
# - end_time: a DateTime object representing the end of an event.
|
14
|
+
#
|
15
|
+
# Examples of values for time_recurrence are:
|
16
|
+
#
|
17
|
+
# - once (reported by nickel as "single"): "lunch with megan tomorrow at noon"
|
18
|
+
# - daily: "Art exhibit until March 1st"
|
19
|
+
# - weekly: "math class every wed from 8-11am"
|
20
|
+
# - daymonthly: "open bar at joes the first friday of every month"
|
21
|
+
# - datemonthly: "pay credit card bill on the 22nd of each month"
|
22
|
+
#
|
23
|
+
# Project website: http://naturalinputs.com/
|
16
24
|
module Nickel
|
17
25
|
require 'date'
|
18
26
|
silence_warnings { require 'nickel' }
|
27
|
+
# Extract time information from a bit of text.
|
19
28
|
def self.time(entity, options = {})
|
20
29
|
n = silence_warnings { ::Nickel.parse(entity.to_s) }
|
21
30
|
occ = n.occurrences[0]
|
22
|
-
|
31
|
+
|
23
32
|
rec = occ.type.to_s.gsub('single', 'once').intern
|
24
33
|
entity.set :time_recurrence, rec
|
25
|
-
interval = occ.interval ? occ.interval
|
34
|
+
interval = occ.interval ? occ.interval : :none
|
26
35
|
entity.set :time_recurrence_interval, interval
|
27
|
-
|
36
|
+
|
28
37
|
s = [occ.start_date, occ.start_time]
|
29
38
|
ds = [s[0].year, s[0].month, s[0].day] if s[0]
|
30
|
-
ts = [s[1].hour, s[1].min, s[1].sec] if s[1]
|
31
|
-
|
39
|
+
#ts = [s[1].hour, s[1].min, s[1].sec] if s[1]
|
40
|
+
|
32
41
|
e = [occ.end_date, occ.end_time]
|
33
42
|
de = [e[0].year, e[0].month, e[0].day] if e[0]
|
34
|
-
te = [e[1].hour, e[1].min, e[1].sec] if e[1]
|
35
|
-
|
36
|
-
entity.set :start_time, ::DateTime.civil(*ds
|
37
|
-
entity.set :end_time, ::DateTime.civil(*de
|
38
|
-
|
43
|
+
#te = [e[1].hour, e[1].min, e[1].sec] if e[1]
|
44
|
+
|
45
|
+
entity.set :start_time, ::DateTime.civil(*ds) if ds
|
46
|
+
entity.set :end_time, ::DateTime.civil(*de) if de
|
47
|
+
|
39
48
|
entity.start_time
|
40
49
|
end
|
41
50
|
end
|
42
51
|
end
|
43
52
|
end
|
44
53
|
end
|
45
|
-
|
@@ -9,6 +9,8 @@ module Treat
|
|
9
9
|
# Blei, David M., Ng, Andrew Y., and Jordan, Michael
|
10
10
|
# I. 2003. Latent dirichlet allocation. Journal of
|
11
11
|
# Machine Learning Research. 3 (Mar. 2003), 993-1022.
|
12
|
+
#
|
13
|
+
# Project website: https://github.com/ealdent/lda-ruby
|
12
14
|
class LDA
|
13
15
|
# Require the lda-ruby gem.
|
14
16
|
silence_warnings { require 'lda-ruby' }
|
@@ -17,25 +19,28 @@ module Treat
|
|
17
19
|
Lda::TextCorpus.class_eval do
|
18
20
|
# Ruby, Y U NO SHUT UP!
|
19
21
|
silence_warnings { undef :initialize }
|
20
|
-
# Redefine initialize to take in an array of
|
21
|
-
def initialize(
|
22
|
+
# Redefine initialize to take in an array of sections
|
23
|
+
def initialize(sections)
|
22
24
|
super(nil)
|
23
|
-
|
24
|
-
add_document(Lda::TextDocument.new(self,
|
25
|
+
sections.each do |section|
|
26
|
+
add_document(Lda::TextDocument.new(self, section))
|
25
27
|
end
|
26
28
|
end
|
27
29
|
end
|
30
|
+
# Default options for the LDA algorithm.
|
31
|
+
DefaultOptions = {
|
32
|
+
topics: 20,
|
33
|
+
words_per_topic: 10,
|
34
|
+
iterations: 20
|
35
|
+
}
|
36
|
+
# Retrieve the topic words of a collection.
|
28
37
|
def self.topic_words(collection, options = {})
|
29
|
-
|
30
|
-
options[:words_per_topic] ||= 10
|
31
|
-
options[:topics] ||= 20
|
32
|
-
options[:iterations] ||= 20
|
33
|
-
|
38
|
+
options = DefaultOptions.merge(options)
|
34
39
|
# Create a corpus with the collection
|
35
|
-
|
40
|
+
sections = collection.sections.collect do |t|
|
36
41
|
t.to_s.encode("UTF-8", :invalid => :replace, :undef => :replace, :replace => "?")
|
37
42
|
end
|
38
|
-
corpus = Lda::TextCorpus.new(
|
43
|
+
corpus = Lda::TextCorpus.new(sections)
|
39
44
|
|
40
45
|
# Create an Lda object for training
|
41
46
|
lda = Lda::Lda.new(corpus)
|
@@ -43,15 +48,15 @@ module Treat
|
|
43
48
|
lda.max_iter = options[:iterations]
|
44
49
|
# Run the EM algorithm using random starting points
|
45
50
|
silence_streams(STDOUT, STDERR) { lda.em('random') }
|
46
|
-
|
51
|
+
|
47
52
|
# Load the vocabulary.
|
48
53
|
if options[:vocabulary]
|
49
54
|
lda.load_vocabulary(options[:vocabulary])
|
50
55
|
end
|
51
|
-
|
52
|
-
# Get the topic words and annotate the
|
56
|
+
|
57
|
+
# Get the topic words and annotate the section.
|
53
58
|
topic_words = lda.top_words(options[:words_per_topic])
|
54
|
-
|
59
|
+
|
55
60
|
topic_words.each do |i, words|
|
56
61
|
collection.each_word do |word|
|
57
62
|
if words.include?(word)
|
@@ -62,7 +67,7 @@ module Treat
|
|
62
67
|
end
|
63
68
|
end
|
64
69
|
end
|
65
|
-
|
70
|
+
|
66
71
|
topic_words
|
67
72
|
end
|
68
73
|
end
|
@@ -6,9 +6,9 @@ module Treat
|
|
6
6
|
#
|
7
7
|
# Copyright 2005 Mark Watson. All rights reserved.
|
8
8
|
# This software is released under the GPL.
|
9
|
-
#
|
10
|
-
#
|
11
|
-
#
|
9
|
+
# Rewrite for inclusion in Treat by Louis Mullie (2011).
|
10
|
+
#
|
11
|
+
# Original project website: http://www.markwatson.com/opensource/
|
12
12
|
class Reuters
|
13
13
|
# Require the Nokogiri XML parser.
|
14
14
|
require 'nokogiri'
|
@@ -17,6 +17,8 @@ module Treat
|
|
17
17
|
@@region = {}
|
18
18
|
@@topics = {}
|
19
19
|
# Get the topic of the text.
|
20
|
+
#
|
21
|
+
# Options: none.
|
20
22
|
def self.topics(text, options = {})
|
21
23
|
stems = []
|
22
24
|
@@reduce = 0
|
@@ -33,7 +35,7 @@ module Treat
|
|
33
35
|
topics = score_words(@@industry, stems)
|
34
36
|
topics = topics.merge(score_words(@@region, stems))
|
35
37
|
topics = topics.merge(score_words(@@topics, stems))
|
36
|
-
Treat::Feature.new(topics)
|
38
|
+
#Treat::Feature.new(topics)
|
37
39
|
end
|
38
40
|
# Read the topics from the XML files.
|
39
41
|
def self.get_topics
|
data/lib/treat/extractors.rb
CHANGED
@@ -6,19 +6,19 @@ module Treat
|
|
6
6
|
module Time
|
7
7
|
extend Group
|
8
8
|
self.type = :annotator
|
9
|
-
self.targets = [:word, :constituent, :symbol]
|
9
|
+
self.targets = [:sentence, :word, :constituent, :symbol]
|
10
10
|
end
|
11
11
|
# Extract the topic from a text.
|
12
12
|
module Topics
|
13
13
|
extend Group
|
14
14
|
self.type = :annotator
|
15
|
-
self.targets = [:collection, :document, :
|
15
|
+
self.targets = [:collection, :document, :zone, :sentence]
|
16
16
|
end
|
17
17
|
# Extract the topic words from a text.
|
18
18
|
module TopicWords
|
19
19
|
extend Group
|
20
20
|
self.type = :annotator
|
21
|
-
self.targets = [:collection, :document, :
|
21
|
+
self.targets = [:collection, :document, :zone, :sentence]
|
22
22
|
end
|
23
23
|
# Extract named entities from texts.
|
24
24
|
module NamedEntity
|
@@ -27,15 +27,15 @@ module Treat
|
|
27
27
|
self.targets = [:entity]
|
28
28
|
end
|
29
29
|
# Extract the keywords from a text.
|
30
|
-
module
|
30
|
+
module Keywords
|
31
31
|
extend Group
|
32
|
-
self.type = :
|
33
|
-
self.targets = [:collection, :document, :
|
32
|
+
self.type = :annotator
|
33
|
+
self.targets = [:collection, :document, :zone, :sentence]
|
34
34
|
end
|
35
35
|
# This module should be moved out of here ASAP.
|
36
36
|
module Statistics
|
37
37
|
extend Group
|
38
|
-
self.type = :
|
38
|
+
self.type = :annotator
|
39
39
|
self.targets = [:entity]
|
40
40
|
self.default = :none
|
41
41
|
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module Treat
|
2
|
+
module Formatters
|
3
|
+
module Readers
|
4
|
+
class Abw
|
5
|
+
require 'rexml/document'
|
6
|
+
require 'rexml/streamlistener'
|
7
|
+
def self.read(document, options = {})
|
8
|
+
xml_h = AbiWordXmlHandler.new(
|
9
|
+
REXML::Document.parse_stream((IO.read(document.file)), xml_h))
|
10
|
+
document << xml_h.plain_text
|
11
|
+
document
|
12
|
+
end
|
13
|
+
class AbiWordXmlHandler
|
14
|
+
include REXML::StreamListener
|
15
|
+
attr_reader :plain_text
|
16
|
+
def initialize
|
17
|
+
@plain_text = ""
|
18
|
+
end
|
19
|
+
def text s
|
20
|
+
begin
|
21
|
+
s = s.strip
|
22
|
+
if s.length > 0
|
23
|
+
@plain_text << s
|
24
|
+
@plain_text << "\n"
|
25
|
+
end
|
26
|
+
end if s != 'AbiWord' && s != 'application/x-abiword'
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
@@ -6,26 +6,28 @@ module Treat
|
|
6
6
|
# the appropriate reader based on the file
|
7
7
|
# extension of the supplied document.
|
8
8
|
class Autoselect
|
9
|
-
# A list of image extensions that should be routed
|
10
|
-
# to the Ocropus OCR engine.
|
9
|
+
# A list of image extensions that should be routed to OCR.
|
11
10
|
ImageExtensions = ['gif', 'jpg', 'jpeg', 'png']
|
11
|
+
# Default options.
|
12
|
+
DefaultOptions = {:ocr => :ocropus}
|
12
13
|
# Select the appropriate reader based on the format
|
13
14
|
# of the filename in document.
|
14
15
|
#
|
15
16
|
# Options:
|
16
|
-
#
|
17
|
-
|
17
|
+
#
|
18
|
+
# - :ocr => :ocropus or :gocr (the OCR engine to use).
|
19
|
+
def self.read(document, options)
|
20
|
+
options = DefaultOptions.merge(options)
|
18
21
|
ext = document.file.split('.')[-1]
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
reader = ext
|
23
|
-
end
|
22
|
+
reader = ImageExtensions.include?(ext) ? 'ocropus' : ext
|
23
|
+
reader = 'html' if reader == 'htm'
|
24
|
+
reader = 'yaml' if reader == 'yml'
|
24
25
|
begin
|
25
26
|
r = Treat::Formatters::Readers.const_get(cc(reader))
|
26
|
-
rescue NameError
|
27
|
+
rescue NameError => e
|
28
|
+
puts e.message
|
27
29
|
raise Treat::Exception,
|
28
|
-
"Cannot find a
|
30
|
+
"Cannot find a reader for format: '#{ext}'."
|
29
31
|
end
|
30
32
|
document = r.read(document, options)
|
31
33
|
end
|
@@ -1,11 +1,31 @@
|
|
1
1
|
module Treat
|
2
2
|
module Formatters
|
3
3
|
module Readers
|
4
|
+
# A temporary HTML reader; simply strips the
|
5
|
+
# document of all of its markup.
|
4
6
|
class HTML
|
7
|
+
# Require Hpricot.
|
8
|
+
silence_warnings { require 'hpricot' }
|
9
|
+
# By default, strip the HTML markup without backing it up.
|
10
|
+
DefaultOptions = { clean: true, backup: false }
|
11
|
+
# Read the HTML document and strip it of its markup.
|
12
|
+
#
|
13
|
+
# Options:
|
14
|
+
#
|
15
|
+
# - (Boolean) :clean => whether to strip HTML markup.
|
16
|
+
# - (Boolean) :backup => whether to backup the HTML
|
17
|
+
# markup while cleaning.
|
5
18
|
def self.read(document, options = {})
|
19
|
+
options = DefaultOptions.merge(options)
|
6
20
|
f = File.read(document.file)
|
7
21
|
document << Treat::Entities::Entity.from_string(f)
|
8
|
-
|
22
|
+
if options[:clean]
|
23
|
+
document.each do |section|
|
24
|
+
section.set :html_value, section.value if options[:backup]
|
25
|
+
section.value = Hpricot(section.value).inner_text
|
26
|
+
end
|
27
|
+
end
|
28
|
+
document
|
9
29
|
end
|
10
30
|
end
|
11
31
|
end
|
@@ -15,11 +15,11 @@ module Treat
|
|
15
15
|
# DFKI and U. Kaiserslautern, Germany.
|
16
16
|
class Ocropus
|
17
17
|
# Read a file using the Google Ocropus reader.
|
18
|
+
#
|
19
|
+
# Options: none.
|
18
20
|
def self.read(document, options = {})
|
19
21
|
create_temp_file(:txt) do |tmp|
|
20
|
-
|
21
|
-
`ocropus page #{document.file} > #{tmp} -STDIO 2>/dev/null`
|
22
|
-
end
|
22
|
+
`ocropus page #{document.file} > #{tmp} -STDIO 2>/dev/null`
|
23
23
|
f = File.read(tmp)
|
24
24
|
document << Treat::Entities::Entity.from_string(f)
|
25
25
|
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
module Treat
|
2
|
+
module Formatters
|
3
|
+
module Readers
|
4
|
+
class Odt
|
5
|
+
# Stream-parse the document's XML and append the extracted plain text to the document.
|
6
|
+
def self.read(document, options = {})
|
7
|
+
f = File.read(document.file)
|
8
|
+
f = f.force_encoding("UTF-8")
|
9
|
+
xml_h = OOXmlHandler.new(
|
10
|
+
REXML::Document.parse_stream(f, xml_h)
|
11
|
+
)
|
12
|
+
document << xml_h.plain_text
|
13
|
+
document
|
14
|
+
end
|
15
|
+
|
16
|
+
class OOXmlHandler
|
17
|
+
require 'rexml/document'
|
18
|
+
require 'rexml/streamlistener'
|
19
|
+
include REXML::StreamListener
|
20
|
+
attr_reader :plain_text
|
21
|
+
def initialize
|
22
|
+
@plain_text = ""
|
23
|
+
end
|
24
|
+
def tag_start(name, attrs)
|
25
|
+
@last_name = name
|
26
|
+
end
|
27
|
+
def text(s)
|
28
|
+
if @last_name.index('text')
|
29
|
+
s = s.strip
|
30
|
+
if s.length > 0
|
31
|
+
@plain_text << s
|
32
|
+
@plain_text << "\n"
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
@@ -1,9 +1,12 @@
|
|
1
1
|
module Treat
|
2
2
|
module Formatters
|
3
3
|
module Readers
|
4
|
+
# A wrapper for the Poppler pdftotext utility, which
|
5
|
+
# extracts the text from a PDF file.
|
4
6
|
class PDF
|
5
|
-
|
6
|
-
#
|
7
|
+
# Read a PDF file using the Poppler pdftotext utility.
|
8
|
+
#
|
9
|
+
# Options: none.
|
7
10
|
def self.read(document, options = {})
|
8
11
|
create_temp_file(:txt) do |tmp|
|
9
12
|
`pdftotext #{document.file} #{tmp} `.strip
|
@@ -4,6 +4,8 @@ module Treat
|
|
4
4
|
# This class simply reads a plain text file.
|
5
5
|
class Txt
|
6
6
|
# Build an entity from a string in plain text format.
|
7
|
+
#
|
8
|
+
# Options: none.
|
7
9
|
def self.read(document, options = {})
|
8
10
|
f = File.read(document.file)
|
9
11
|
document << Treat::Entities::Entity.from_string(f)
|
@@ -1,7 +1,7 @@
|
|
1
1
|
module Treat
|
2
2
|
module Formatters
|
3
3
|
module Serializers
|
4
|
-
# This class converts an entity to XML format.
|
4
|
+
# This class converts an entity to a storable XML format.
|
5
5
|
class XML
|
6
6
|
# Require the Nokogiri XML parser.
|
7
7
|
require 'nokogiri'
|
@@ -9,7 +9,8 @@ module Treat
|
|
9
9
|
def self.serialize(entity, options = {})
|
10
10
|
options = {:indent => 0} if options[:indent].nil?
|
11
11
|
if options[:indent] == 0
|
12
|
-
|
12
|
+
enc = entity.encoding(:r_chardet19).to_s.gsub('_', '-').upcase
|
13
|
+
string = "<?xml version=\"1.0\" encoding=\"#{enc}\" standalone=\"no\" ?>"
|
13
14
|
else
|
14
15
|
string = ''
|
15
16
|
end
|
@@ -1,7 +1,13 @@
|
|
1
1
|
module Treat
|
2
2
|
module Formatters
|
3
3
|
module Unserializers
|
4
|
+
# This class doesn't perform any unserializing;
|
5
|
+
# it simply routes the document to an unserializer
|
6
|
+
# based on the file extension of the document.
|
4
7
|
class Autoselect
|
8
|
+
# Unserialize any supported file format.
|
9
|
+
#
|
10
|
+
# Options: none.
|
5
11
|
def self.unserialize(document, options = {})
|
6
12
|
ext = document.file.split('.')[-1]
|
7
13
|
if ext == 'yaml' || ext == 'yml'
|
@@ -9,7 +15,7 @@ module Treat
|
|
9
15
|
elsif ext == 'xml'
|
10
16
|
document.unserialize(:xml)
|
11
17
|
else
|
12
|
-
raise "File #{document.file} was not recognized"+
|
18
|
+
raise "File #{document.file} was not recognized "+
|
13
19
|
"as a supported serialized format."
|
14
20
|
end
|
15
21
|
end
|
@@ -1,9 +1,13 @@
|
|
1
1
|
module Treat
|
2
2
|
module Formatters
|
3
3
|
module Unserializers
|
4
|
+
# Recreates the entity tree corresponding to
|
5
|
+
# a serialized XML file.
|
4
6
|
class XML
|
5
7
|
require 'nokogiri'
|
6
|
-
|
8
|
+
# Unserialize an entity stored in XML format.
|
9
|
+
#
|
10
|
+
# Options: none.
|
7
11
|
def self.unserialize(document, options = {})
|
8
12
|
# Read in the XML file.
|
9
13
|
xml = File.read(document.file)
|
@@ -59,6 +63,7 @@ module Treat
|
|
59
63
|
current_value = xml_reader.value.strip
|
60
64
|
if current_value && current_value != ''
|
61
65
|
current_element.value = current_value
|
66
|
+
current_element.register_token(current_element)
|
62
67
|
end
|
63
68
|
end
|
64
69
|
|
@@ -1,10 +1,14 @@
|
|
1
1
|
module Treat
|
2
2
|
module Formatters
|
3
3
|
module Unserializers
|
4
|
+
# This class is a wrapper for the Psych YAML
|
5
|
+
# parser; it unserializes YAML files.
|
4
6
|
class YAML
|
5
7
|
# Require the Psych YAML parser.
|
6
8
|
require 'psych'
|
7
|
-
# Unserialize a YAML file
|
9
|
+
# Unserialize a YAML file.
|
10
|
+
#
|
11
|
+
# Options: none.
|
8
12
|
def self.unserialize(document, options = {})
|
9
13
|
document << ::Psych.load(File.read(document.file))
|
10
14
|
document
|