treat gem — diff of changes from version 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (100):
  1. data/LICENSE +7 -8
  2. data/TODO +16 -13
  3. data/examples/keywords.rb +89 -1
  4. data/lib/treat/buildable.rb +1 -8
  5. data/lib/treat/categories.rb +3 -4
  6. data/lib/treat/category.rb +1 -1
  7. data/lib/treat/delegatable.rb +1 -1
  8. data/lib/treat/detectors/encoding/native.rb +5 -0
  9. data/lib/treat/detectors/encoding/r_chardet19.rb +2 -3
  10. data/lib/treat/detectors/language/language_detector.rb +4 -0
  11. data/lib/treat/detectors/language/what_language.rb +4 -4
  12. data/lib/treat/detectors.rb +1 -1
  13. data/lib/treat/entities/entity.rb +5 -3
  14. data/lib/treat/entities/tokens.rb +14 -5
  15. data/lib/treat/entities/zones.rb +4 -0
  16. data/lib/treat/entities.rb +7 -5
  17. data/lib/treat/extractors/keywords/topics_frequency.rb +40 -0
  18. data/lib/treat/extractors/statistics/{frequency.rb → frequency_in.rb} +5 -4
  19. data/lib/treat/extractors/statistics/frequency_of.rb +3 -5
  20. data/lib/treat/extractors/statistics/{position_in.rb → position_in_parent.rb} +4 -3
  21. data/lib/treat/extractors/statistics/tf_idf.rb +36 -0
  22. data/lib/treat/extractors/statistics/transition_matrix.rb +25 -25
  23. data/lib/treat/extractors/statistics/transition_probability.rb +8 -4
  24. data/lib/treat/extractors/time/chronic.rb +8 -0
  25. data/lib/treat/extractors/time/native.rb +6 -0
  26. data/lib/treat/extractors/time/nickel.rb +31 -23
  27. data/lib/treat/extractors/topic_words/lda.rb +21 -16
  28. data/lib/treat/extractors/topics/reuters.rb +6 -4
  29. data/lib/treat/extractors.rb +7 -7
  30. data/lib/treat/formatters/readers/abw.rb +32 -0
  31. data/lib/treat/formatters/readers/autoselect.rb +13 -11
  32. data/lib/treat/formatters/readers/doc.rb +13 -0
  33. data/lib/treat/formatters/readers/gocr.rb +2 -0
  34. data/lib/treat/formatters/readers/html.rb +21 -1
  35. data/lib/treat/formatters/readers/ocropus.rb +3 -3
  36. data/lib/treat/formatters/readers/odt.rb +41 -0
  37. data/lib/treat/formatters/readers/pdf.rb +5 -2
  38. data/lib/treat/formatters/readers/txt.rb +2 -0
  39. data/lib/treat/formatters/serializers/xml.rb +3 -2
  40. data/lib/treat/formatters/serializers/yaml.rb +2 -0
  41. data/lib/treat/formatters/unserializers/autoselect.rb +7 -1
  42. data/lib/treat/formatters/unserializers/xml.rb +6 -1
  43. data/lib/treat/formatters/unserializers/yaml.rb +5 -1
  44. data/lib/treat/formatters/visualizers/dot.rb +35 -37
  45. data/lib/treat/formatters/visualizers/html.rb +1 -0
  46. data/lib/treat/formatters/visualizers/inspect.rb +4 -0
  47. data/lib/treat/formatters/visualizers/short_value.rb +18 -3
  48. data/lib/treat/formatters/visualizers/standoff.rb +11 -6
  49. data/lib/treat/formatters/visualizers/tree.rb +5 -1
  50. data/lib/treat/formatters/visualizers/txt.rb +6 -1
  51. data/lib/treat/formatters.rb +1 -1
  52. data/lib/treat/group.rb +4 -3
  53. data/lib/treat/inflectors/cardinal_words/linguistics.rb +14 -17
  54. data/lib/treat/inflectors/conjugations/linguistics.rb +16 -3
  55. data/lib/treat/inflectors/declensions/linguistics.rb +17 -6
  56. data/lib/treat/inflectors/ordinal_words/linguistics.rb +9 -10
  57. data/lib/treat/inflectors/stem/porter.rb +6 -2
  58. data/lib/treat/inflectors/stem/porter_c.rb +4 -1
  59. data/lib/treat/inflectors/stem/uea.rb +4 -4
  60. data/lib/treat/languages/english/tags.rb +16 -0
  61. data/lib/treat/languages/english.rb +4 -1
  62. data/lib/treat/lexicalizers/category/from_tag.rb +4 -4
  63. data/lib/treat/lexicalizers/linkages/naive.rb +3 -3
  64. data/lib/treat/lexicalizers/tag/brill.rb +3 -11
  65. data/lib/treat/lexicalizers/tag/lingua.rb +4 -6
  66. data/lib/treat/lexicalizers.rb +0 -2
  67. data/lib/treat/processors/chunkers/txt.rb +4 -4
  68. data/lib/treat/processors/parsers/enju.rb +3 -17
  69. data/lib/treat/processors/parsers/stanford.rb +4 -0
  70. data/lib/treat/processors/segmenters/punkt.rb +1 -0
  71. data/lib/treat/processors/segmenters/stanford.rb +4 -0
  72. data/lib/treat/processors/segmenters/tactful.rb +4 -1
  73. data/lib/treat/processors/tokenizers/punkt.rb +1 -2
  74. data/lib/treat/processors/tokenizers/stanford.rb +4 -0
  75. data/lib/treat/processors/tokenizers/tactful.rb +1 -1
  76. data/lib/treat/processors.rb +4 -4
  77. data/lib/treat/proxies.rb +18 -11
  78. data/lib/treat/registrable.rb +12 -5
  79. data/lib/treat/sugar.rb +8 -3
  80. data/lib/treat/tree.rb +10 -3
  81. data/lib/treat.rb +55 -55
  82. data/test/tc_entity.rb +7 -7
  83. data/test/tc_extractors.rb +6 -4
  84. data/test/tc_formatters.rb +0 -4
  85. data/test/tests.rb +2 -0
  86. data/test/texts.rb +4 -4
  87. metadata +48 -56
  88. data/examples/texts/bugged_out.txt +0 -26
  89. data/examples/texts/half_cocked_basel.txt +0 -16
  90. data/examples/texts/hedge_funds.txt +0 -24
  91. data/examples/texts/hose_and_dry.txt +0 -19
  92. data/examples/texts/hungarys_troubles.txt +0 -46
  93. data/examples/texts/indias_slowdown.txt +0 -15
  94. data/examples/texts/merkozy_rides_again.txt +0 -24
  95. data/examples/texts/prada_is_not_walmart.txt +0 -9
  96. data/examples/texts/republican_nomination.txt +0 -26
  97. data/examples/texts/to_infinity_and_beyond.txt +0 -15
  98. data/lib/treat/entities/text.rb +0 -7
  99. data/lib/treat/extractors/key_sentences/topics_frequency.rb +0 -49
  100. data/lib/treat/formatters/cleaners/html.rb +0 -17
@@ -1,6 +1,8 @@
1
1
  module Treat
2
2
  module Extractors
3
3
  module Statistics
4
+ # Experimental algorithm to calculate the transition
5
+ # probability of an observed word.
4
6
  class TransitionProbability
5
7
 
6
8
  # Find the transition probability.
@@ -18,14 +20,16 @@ module Treat
18
20
  next unless tm[f1][v1]
19
21
 
20
22
  relationships.each do |relationship|
21
- relatives = target.send(relationship)
23
+ relatives = entity.send(relationship)
22
24
  relatives = [relatives] unless relatives.is_a? Array
23
25
  relatives.each do |relative|
24
26
  next if relative.nil? || !relative.has?(f2)
25
27
  v2 = relative.send(f2)
26
- if tm[f1][v1][relationship][f2][v2]
27
- score += tm[f1][v1][relationship][f2][v2]
28
- count += 1
28
+ if tm[f1][v1][relationship] &&
29
+ tm[f1][v1][relationship][f2] &&
30
+ tm[f1][v1][relationship][f2][v2]
31
+ score += tm[f1][v1][relationship][f2][v2]
32
+ count += 1
29
33
  end
30
34
  end
31
35
  end
@@ -1,8 +1,16 @@
1
1
  module Treat
2
2
  module Extractors
3
3
  module Time
4
+ # A wrapper for the 'chronic' gem, which parses
5
+ # time and date information.
6
+ #
7
+ # Project website: http://chronic.rubyforge.org/
4
8
  class Chronic
5
9
  silence_warnings { require 'chronic' }
10
+ # Return the time information contained within the entity
11
+ # by parsing it with the 'chronic' gem.
12
+ #
13
+ # Options: none.
6
14
  def self.time(entity, options = {})
7
15
  silence_warnings { ::Chronic.parse(entity.to_s, {:guess => true}) }
8
16
  end
@@ -1,8 +1,14 @@
1
1
  module Treat
2
2
  module Extractors
3
3
  module Time
4
+ # A wrapper for Ruby's native date/time parsing.
4
5
  module Native
5
6
  require 'date'
7
+ # Return a DateTime object representing the date/time
8
+ # contained within the entity, using Ruby's native
9
+ # date/time parser.
10
+ #
11
+ # Options: none.
6
12
  def self.time(entity, options = {})
7
13
  ::DateTime.parse(entity.to_s)
8
14
  end
@@ -1,45 +1,53 @@
1
1
  module Treat
2
2
  module Extractors
3
3
  module Time
4
- =begin
5
- Annotations
6
-
7
- Type examples
8
-
9
- single "lunch with megan tomorrow at noon"
10
- daily "Art exhibit until March 1st"
11
- weekly "math class every wed from 8-11am"
12
- daymonthly "open bar at joes the first friday of every month"
13
- datemonthly "pay credit card bill on the 22nd of each month"
14
-
15
- =end
4
+ # A wrapper for the 'nickel' gem, which parses
5
+ # times and dates and supplies additional information
6
+ # concerning these. The additional information supplied
7
+ # that this class annotates entities with is:
8
+ #
9
+ # - time_recurrence: frequency of recurrence in words*.
10
+ # - time_recurrence_interval: frequency of recurrence in days.
11
+ # - start_time: a DateTime object representing the beginning of
12
+ # an event.
13
+ # - end_time: a DateTime object representing the end of an event.
14
+ #
15
+ # Examples of values for time_recurrence are:
16
+ #
17
+ # - single: "lunch with megan tomorrow at noon"
18
+ # - daily: "Art exhibit until March 1st"
19
+ # - weekly: "math class every wed from 8-11am"
20
+ # - daymonthly: "open bar at joes the first friday of every month"
21
+ # - datemonthly: "pay credit card bill on the 22nd of each month"
22
+ #
23
+ # Project website: http://naturalinputs.com/
16
24
  module Nickel
17
25
  require 'date'
18
26
  silence_warnings { require 'nickel' }
27
+ # Extract time information from a bit of text.
19
28
  def self.time(entity, options = {})
20
29
  n = silence_warnings { ::Nickel.parse(entity.to_s) }
21
30
  occ = n.occurrences[0]
22
- # Find the words..
31
+
23
32
  rec = occ.type.to_s.gsub('single', 'once').intern
24
33
  entity.set :time_recurrence, rec
25
- interval = occ.interval ? occ.interval.intern : :none
34
+ interval = occ.interval ? occ.interval : :none
26
35
  entity.set :time_recurrence_interval, interval
27
-
36
+
28
37
  s = [occ.start_date, occ.start_time]
29
38
  ds = [s[0].year, s[0].month, s[0].day] if s[0]
30
- ts = [s[1].hour, s[1].min, s[1].sec] if s[1]
31
-
39
+ #ts = [s[1].hour, s[1].min, s[1].sec] if s[1]
40
+
32
41
  e = [occ.end_date, occ.end_time]
33
42
  de = [e[0].year, e[0].month, e[0].day] if e[0]
34
- te = [e[1].hour, e[1].min, e[1].sec] if e[1]
35
-
36
- entity.set :start_time, ::DateTime.civil(*ds, *ts) if ds
37
- entity.set :end_time, ::DateTime.civil(*de, *te) if de
38
-
43
+ #te = [e[1].hour, e[1].min, e[1].sec] if e[1]
44
+
45
+ entity.set :start_time, ::DateTime.civil(*ds) if ds
46
+ entity.set :end_time, ::DateTime.civil(*de) if de
47
+
39
48
  entity.start_time
40
49
  end
41
50
  end
42
51
  end
43
52
  end
44
53
  end
45
-
@@ -9,6 +9,8 @@ module Treat
9
9
  # Blei, David M., Ng, Andrew Y., and Jordan, Michael
10
10
  # I. 2003. Latent dirichlet allocation. Journal of
11
11
  # Machine Learning Research. 3 (Mar. 2003), 993-1022.
12
+ #
13
+ # Project website: https://github.com/ealdent/lda-ruby
12
14
  class LDA
13
15
  # Require the lda-ruby gem.
14
16
  silence_warnings { require 'lda-ruby' }
@@ -17,25 +19,28 @@ module Treat
17
19
  Lda::TextCorpus.class_eval do
18
20
  # Ruby, Y U NO SHUT UP!
19
21
  silence_warnings { undef :initialize }
20
- # Redefine initialize to take in an array of texts.
21
- def initialize(texts)
22
+ # Redefine initialize to take in an array of sections
23
+ def initialize(sections)
22
24
  super(nil)
23
- texts.each do |text|
24
- add_document(Lda::TextDocument.new(self, text))
25
+ sections.each do |section|
26
+ add_document(Lda::TextDocument.new(self, section))
25
27
  end
26
28
  end
27
29
  end
30
+ # Default options for the LDA algorithm.
31
+ DefaultOptions = {
32
+ topics: 20,
33
+ words_per_topic: 10,
34
+ iterations: 20
35
+ }
36
+ # Retrieve the topic words of a collection.
28
37
  def self.topic_words(collection, options = {})
29
- # Set the options
30
- options[:words_per_topic] ||= 10
31
- options[:topics] ||= 20
32
- options[:iterations] ||= 20
33
-
38
+ options = DefaultOptions.merge(options)
34
39
  # Create a corpus with the collection
35
- texts = collection.texts.collect do |t|
40
+ sections = collection.sections.collect do |t|
36
41
  t.to_s.encode("UTF-8", :invalid => :replace, :undef => :replace, :replace => "?")
37
42
  end
38
- corpus = Lda::TextCorpus.new(texts)
43
+ corpus = Lda::TextCorpus.new(sections)
39
44
 
40
45
  # Create an Lda object for training
41
46
  lda = Lda::Lda.new(corpus)
@@ -43,15 +48,15 @@ module Treat
43
48
  lda.max_iter = options[:iterations]
44
49
  # Run the EM algorithm using random starting points
45
50
  silence_streams(STDOUT, STDERR) { lda.em('random') }
46
-
51
+
47
52
  # Load the vocabulary.
48
53
  if options[:vocabulary]
49
54
  lda.load_vocabulary(options[:vocabulary])
50
55
  end
51
-
52
- # Get the topic words and annotate the text.
56
+
57
+ # Get the topic words and annotate the section.
53
58
  topic_words = lda.top_words(options[:words_per_topic])
54
-
59
+
55
60
  topic_words.each do |i, words|
56
61
  collection.each_word do |word|
57
62
  if words.include?(word)
@@ -62,7 +67,7 @@ module Treat
62
67
  end
63
68
  end
64
69
  end
65
-
70
+
66
71
  topic_words
67
72
  end
68
73
  end
@@ -6,9 +6,9 @@ module Treat
6
6
  #
7
7
  # Copyright 2005 Mark Watson. All rights reserved.
8
8
  # This software is released under the GPL.
9
- #
10
- # Modifications for inclusion in Treat by
11
- # Louis Mullie (2011).
9
+ # Rewrite for inclusion in Treat by Louis Mullie (2011).
10
+ #
11
+ # Original project website: http://www.markwatson.com/opensource/
12
12
  class Reuters
13
13
  # Require the Nokogiri XML parser.
14
14
  require 'nokogiri'
@@ -17,6 +17,8 @@ module Treat
17
17
  @@region = {}
18
18
  @@topics = {}
19
19
  # Get the topic of the text.
20
+ #
21
+ # Options: none.
20
22
  def self.topics(text, options = {})
21
23
  stems = []
22
24
  @@reduce = 0
@@ -33,7 +35,7 @@ module Treat
33
35
  topics = score_words(@@industry, stems)
34
36
  topics = topics.merge(score_words(@@region, stems))
35
37
  topics = topics.merge(score_words(@@topics, stems))
36
- Treat::Feature.new(topics)
38
+ #Treat::Feature.new(topics)
37
39
  end
38
40
  # Read the topics from the XML files.
39
41
  def self.get_topics
@@ -6,19 +6,19 @@ module Treat
6
6
  module Time
7
7
  extend Group
8
8
  self.type = :annotator
9
- self.targets = [:word, :constituent, :symbol]
9
+ self.targets = [:sentence, :word, :constituent, :symbol]
10
10
  end
11
11
  # Extract the topic from a text.
12
12
  module Topics
13
13
  extend Group
14
14
  self.type = :annotator
15
- self.targets = [:collection, :document, :text, :zone, :sentence]
15
+ self.targets = [:collection, :document, :zone, :sentence]
16
16
  end
17
17
  # Extract the topic from a text.
18
18
  module TopicWords
19
19
  extend Group
20
20
  self.type = :annotator
21
- self.targets = [:collection, :document, :text, :zone, :sentence]
21
+ self.targets = [:collection, :document, :zone, :sentence]
22
22
  end
23
23
  # Extract named entities from texts.
24
24
  module NamedEntity
@@ -27,15 +27,15 @@ module Treat
27
27
  self.targets = [:entity]
28
28
  end
29
29
  # Extract the key sentences from a text.
30
- module KeySentences
30
+ module Keywords
31
31
  extend Group
32
- self.type = :computer
33
- self.targets = [:collection, :document, :text, :zone, :sentence]
32
+ self.type = :annotator
33
+ self.targets = [:collection, :document, :zone, :sentence]
34
34
  end
35
35
  # This module should be moved out of here ASAP.
36
36
  module Statistics
37
37
  extend Group
38
- self.type = :computer
38
+ self.type = :annotator
39
39
  self.targets = [:entity]
40
40
  self.default = :none
41
41
  end
@@ -0,0 +1,32 @@
1
+ module Treat
2
+ module Formatters
3
+ module Readers
4
+ class Abw
5
+ require 'rexml/document'
6
+ require 'rexml/streamlistener'
7
+ def self.read(document, options = {})
8
+ xml_h = AbiWordXmlHandler.new(
9
+ REXML::Document.parse_stream((IO.read(document.file)), xml_h))
10
+ document << xml_h.plain_text
11
+ document
12
+ end
13
+ class AbiWordXmlHandler
14
+ include REXML::StreamListener
15
+ attr_reader :plain_text
16
+ def initialize
17
+ @plain_text = ""
18
+ end
19
+ def text s
20
+ begin
21
+ s = s.strip
22
+ if s.length > 0
23
+ @plain_text << s
24
+ @plain_text << "\n"
25
+ end
26
+ end if s != 'AbiWord' && s != 'application/x-abiword'
27
+ end
28
+ end
29
+ end
30
+ end
31
+ end
32
+ end
@@ -6,26 +6,28 @@ module Treat
6
6
  # the appropriate reader based on the file
7
7
  # extension of the supplied document.
8
8
  class Autoselect
9
- # A list of image extensions that should be routed
10
- # to the Ocropus OCR engine.
9
+ # A list of image extensions that should be routed to OCR.
11
10
  ImageExtensions = ['gif', 'jpg', 'jpeg', 'png']
11
+ # Default options.
12
+ DefaultOptions = {:ocr => :ocropus}
12
13
  # Select the appropriate reader based on the format
13
14
  # of the filename in document.
14
15
  #
15
16
  # Options:
16
- # :ocr => :ocropus | :gocr (the OCR engine to use).
17
- def self.read(document, options = {:ocr => :ocropus})
17
+ #
18
+ # - :ocr_engine => :ocropus or :gocr (the OCR engine to use).
19
+ def self.read(document, options)
20
+ options = DefaultOptions.merge(options)
18
21
  ext = document.file.split('.')[-1]
19
- if ImageExtensions.include?(ext)
20
- reader = 'ocropus'
21
- else
22
- reader = ext
23
- end
22
+ reader = ImageExtensions.include?(ext) ? 'ocropus' : ext
23
+ reader = 'html' if reader == 'htm'
24
+ reader = 'yaml' if reader == 'yml'
24
25
  begin
25
26
  r = Treat::Formatters::Readers.const_get(cc(reader))
26
- rescue NameError
27
+ rescue NameError => e
28
+ puts e.message
27
29
  raise Treat::Exception,
28
- "Cannot find a default reader for format: '#{ext}'."
30
+ "Cannot find a reader for format: '#{ext}'."
29
31
  end
30
32
  document = r.read(document, options)
31
33
  end
@@ -0,0 +1,13 @@
1
+ module Treat
2
+ module Formatters
3
+ module Readers
4
+ class Doc
5
+ def self.read(document, options = {})
6
+ f = `antiword #{document.file}`
7
+ document << Treat::Entities::Entity.from_string(f)
8
+ document
9
+ end
10
+ end
11
+ end
12
+ end
13
+ end
@@ -10,6 +10,8 @@ module Treat
10
10
  # Project site: http://jocr.sourceforge.net
11
11
  class GOCR
12
12
  # Read a file using the GOCR reader.
13
+ #
14
+ # Options: none.
13
15
  def self.read(document, options = {})
14
16
  create_temp_file(:pgm) do |tmp|
15
17
  `convert #{document.file} #{tmp}`
@@ -1,11 +1,31 @@
1
1
  module Treat
2
2
  module Formatters
3
3
  module Readers
4
+ # A temporary HTML reader; simply strips the
5
+ # document of all of its markup.
4
6
  class HTML
7
+ # Require Hpricot.
8
+ silence_warnings { require 'hpricot' }
9
+ # By default, backup the HTML text while cleaning.
10
+ DefaultOptions = { clean: true, backup: false }
11
+ # Read the HTML document and strip it of its markup.
12
+ #
13
+ # Options:
14
+ #
15
+ # - (Boolean) :clean => whether to strip HTML markup.
16
+ # - (Boolean) :backup => whether to backup the HTML
17
+ # markup while cleaning.
5
18
  def self.read(document, options = {})
19
+ options = DefaultOptions.merge(options)
6
20
  f = File.read(document.file)
7
21
  document << Treat::Entities::Entity.from_string(f)
8
- document.clean(:html)
22
+ if options[:clean]
23
+ document.each do |section|
24
+ section.set :html_value, section.value if options[:backup]
25
+ section.value = Hpricot(section.value).inner_text
26
+ end
27
+ end
28
+ document
9
29
  end
10
30
  end
11
31
  end
@@ -15,11 +15,11 @@ module Treat
15
15
  # DFKI and U. Kaiserslautern, Germany.
16
16
  class Ocropus
17
17
  # Read a file using the Google Ocropus reader.
18
+ #
19
+ # Options: none.
18
20
  def self.read(document, options = {})
19
21
  create_temp_file(:txt) do |tmp|
20
- capture(:stderr) do
21
- `ocropus page #{document.file} > #{tmp} -STDIO 2>/dev/null`
22
- end
22
+ `ocropus page #{document.file} > #{tmp} -STDIO 2>/dev/null`
23
23
  f = File.read(tmp)
24
24
  document << Treat::Entities::Entity.from_string(f)
25
25
  end
@@ -0,0 +1,41 @@
1
+ module Treat
2
+ module Formatters
3
+ module Readers
4
+ class Odt
5
+ # Build an entity from a string in plain text format.
6
+ def self.read(document, options = {})
7
+ f = File.read(document.file)
8
+ f = f.force_encoding("UTF-8")
9
+ xml_h = OOXmlHandler.new(
10
+ REXML::Document.parse_stream(f, xml_h)
11
+ )
12
+ document << xml_h.plain_text
13
+ document
14
+ end
15
+
16
+ class OOXmlHandler
17
+ require 'rexml/document'
18
+ require 'rexml/streamlistener'
19
+ include REXML::StreamListener
20
+ attr_reader :plain_text
21
+ def initialize
22
+ @plain_text = ""
23
+ end
24
+ def tag_start(name, attrs)
25
+ @last_name = name
26
+ end
27
+ def text(s)
28
+ if @last_name.index('text')
29
+ s = s.strip
30
+ if s.length > 0
31
+ @plain_text << s
32
+ @plain_text << "\n"
33
+ end
34
+ end
35
+ end
36
+ end
37
+ end
38
+
39
+ end
40
+ end
41
+ end
@@ -1,9 +1,12 @@
1
1
  module Treat
2
2
  module Formatters
3
3
  module Readers
4
+ # A wrapper for the Poppler pdf2text utility, which
5
+ # extracts the text from a PDF file.
4
6
  class PDF
5
- require 'fileutils'
6
- # Read a file using the Poppler pdf2text utility.
7
+ # Read a PDF file using the Poppler pdf2text utility.
8
+ #
9
+ # Options: none.
7
10
  def self.read(document, options = {})
8
11
  create_temp_file(:txt) do |tmp|
9
12
  `pdftotext #{document.file} #{tmp} `.strip
@@ -4,6 +4,8 @@ module Treat
4
4
  # This class simply reads a plain text file.
5
5
  class Txt
6
6
  # Build an entity from a string in plain text format.
7
+ #
8
+ # Options: none.
7
9
  def self.read(document, options = {})
8
10
  f = File.read(document.file)
9
11
  document << Treat::Entities::Entity.from_string(f)
@@ -1,7 +1,7 @@
1
1
  module Treat
2
2
  module Formatters
3
3
  module Serializers
4
- # This class converts an entity to XML format.
4
+ # This class converts an entity to a storable XML format.
5
5
  class XML
6
6
  # Reauire the Nokogiri XML parser.
7
7
  require 'nokogiri'
@@ -9,7 +9,8 @@ module Treat
9
9
  def self.serialize(entity, options = {})
10
10
  options = {:indent => 0} if options[:indent].nil?
11
11
  if options[:indent] == 0
12
- string = '<?xml version="1.0" encoding="UTF-8" standalone="no" ?>'
12
+ enc = entity.encoding(:r_chardet19).to_s.gsub('_', '-').upcase
13
+ string = "<?xml version=\"1.0\" encoding=\"#{enc}\" standalone=\"no\" ?>"
13
14
  else
14
15
  string = ''
15
16
  end
@@ -6,6 +6,8 @@ module Treat
6
6
  # This class serializes entities in YAML format.
7
7
  class YAML
8
8
  # Serialize an entity in YAML format.
9
+ #
10
+ # Options: none.
9
11
  def self.serialize(entity, options = {})
10
12
  ::Psych.dump(entity)
11
13
  end
@@ -1,7 +1,13 @@
1
1
  module Treat
2
2
  module Formatters
3
3
  module Unserializers
4
+ # This class doesn't perform any unserializing;
5
+ # it simply routes the document to an unserializer
6
+ # based on the file extension of the document.
4
7
  class Autoselect
8
+ # Unserialize any supported file format.
9
+ #
10
+ # Options: none.
5
11
  def self.unserialize(document, options = {})
6
12
  ext = document.file.split('.')[-1]
7
13
  if ext == 'yaml' || ext == 'yml'
@@ -9,7 +15,7 @@ module Treat
9
15
  elsif ext == 'xml'
10
16
  document.unserialize(:xml)
11
17
  else
12
- raise "File #{document.file} was not recognized"+
18
+ raise "File #{document.file} was not recognized "+
13
19
  "as a supported serialized format."
14
20
  end
15
21
  end
@@ -1,9 +1,13 @@
1
1
  module Treat
2
2
  module Formatters
3
3
  module Unserializers
4
+ # Recreates the entity tree corresponding to
5
+ # a serialized XML file.
4
6
  class XML
5
7
  require 'nokogiri'
6
-
8
+ # Unserialize an entity stored in XML format.
9
+ #
10
+ # Options: none.
7
11
  def self.unserialize(document, options = {})
8
12
  # Read in the XML file.
9
13
  xml = File.read(document.file)
@@ -59,6 +63,7 @@ module Treat
59
63
  current_value = xml_reader.value.strip
60
64
  if current_value && current_value != ''
61
65
  current_element.value = current_value
66
+ current_element.register_token(current_element)
62
67
  end
63
68
  end
64
69
 
@@ -1,10 +1,14 @@
1
1
  module Treat
2
2
  module Formatters
3
3
  module Unserializers
4
+ # This class is a wrapper for the Psych YAML
5
+ # parser; it unserializes YAML files.
4
6
  class YAML
5
7
  # Require the Psych YAML parser.
6
8
  require 'psych'
7
- # Unserialize a YAML file representing an entity.
9
+ # Unserialize a YAML file.
10
+ #
11
+ # Options: none.
8
12
  def self.unserialize(document, options = {})
9
13
  document << ::Psych.load(File.read(document.file))
10
14
  document