treat 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100)
  1. data/LICENSE +7 -8
  2. data/TODO +16 -13
  3. data/examples/keywords.rb +89 -1
  4. data/lib/treat/buildable.rb +1 -8
  5. data/lib/treat/categories.rb +3 -4
  6. data/lib/treat/category.rb +1 -1
  7. data/lib/treat/delegatable.rb +1 -1
  8. data/lib/treat/detectors/encoding/native.rb +5 -0
  9. data/lib/treat/detectors/encoding/r_chardet19.rb +2 -3
  10. data/lib/treat/detectors/language/language_detector.rb +4 -0
  11. data/lib/treat/detectors/language/what_language.rb +4 -4
  12. data/lib/treat/detectors.rb +1 -1
  13. data/lib/treat/entities/entity.rb +5 -3
  14. data/lib/treat/entities/tokens.rb +14 -5
  15. data/lib/treat/entities/zones.rb +4 -0
  16. data/lib/treat/entities.rb +7 -5
  17. data/lib/treat/extractors/keywords/topics_frequency.rb +40 -0
  18. data/lib/treat/extractors/statistics/{frequency.rb → frequency_in.rb} +5 -4
  19. data/lib/treat/extractors/statistics/frequency_of.rb +3 -5
  20. data/lib/treat/extractors/statistics/{position_in.rb → position_in_parent.rb} +4 -3
  21. data/lib/treat/extractors/statistics/tf_idf.rb +36 -0
  22. data/lib/treat/extractors/statistics/transition_matrix.rb +25 -25
  23. data/lib/treat/extractors/statistics/transition_probability.rb +8 -4
  24. data/lib/treat/extractors/time/chronic.rb +8 -0
  25. data/lib/treat/extractors/time/native.rb +6 -0
  26. data/lib/treat/extractors/time/nickel.rb +31 -23
  27. data/lib/treat/extractors/topic_words/lda.rb +21 -16
  28. data/lib/treat/extractors/topics/reuters.rb +6 -4
  29. data/lib/treat/extractors.rb +7 -7
  30. data/lib/treat/formatters/readers/abw.rb +32 -0
  31. data/lib/treat/formatters/readers/autoselect.rb +13 -11
  32. data/lib/treat/formatters/readers/doc.rb +13 -0
  33. data/lib/treat/formatters/readers/gocr.rb +2 -0
  34. data/lib/treat/formatters/readers/html.rb +21 -1
  35. data/lib/treat/formatters/readers/ocropus.rb +3 -3
  36. data/lib/treat/formatters/readers/odt.rb +41 -0
  37. data/lib/treat/formatters/readers/pdf.rb +5 -2
  38. data/lib/treat/formatters/readers/txt.rb +2 -0
  39. data/lib/treat/formatters/serializers/xml.rb +3 -2
  40. data/lib/treat/formatters/serializers/yaml.rb +2 -0
  41. data/lib/treat/formatters/unserializers/autoselect.rb +7 -1
  42. data/lib/treat/formatters/unserializers/xml.rb +6 -1
  43. data/lib/treat/formatters/unserializers/yaml.rb +5 -1
  44. data/lib/treat/formatters/visualizers/dot.rb +35 -37
  45. data/lib/treat/formatters/visualizers/html.rb +1 -0
  46. data/lib/treat/formatters/visualizers/inspect.rb +4 -0
  47. data/lib/treat/formatters/visualizers/short_value.rb +18 -3
  48. data/lib/treat/formatters/visualizers/standoff.rb +11 -6
  49. data/lib/treat/formatters/visualizers/tree.rb +5 -1
  50. data/lib/treat/formatters/visualizers/txt.rb +6 -1
  51. data/lib/treat/formatters.rb +1 -1
  52. data/lib/treat/group.rb +4 -3
  53. data/lib/treat/inflectors/cardinal_words/linguistics.rb +14 -17
  54. data/lib/treat/inflectors/conjugations/linguistics.rb +16 -3
  55. data/lib/treat/inflectors/declensions/linguistics.rb +17 -6
  56. data/lib/treat/inflectors/ordinal_words/linguistics.rb +9 -10
  57. data/lib/treat/inflectors/stem/porter.rb +6 -2
  58. data/lib/treat/inflectors/stem/porter_c.rb +4 -1
  59. data/lib/treat/inflectors/stem/uea.rb +4 -4
  60. data/lib/treat/languages/english/tags.rb +16 -0
  61. data/lib/treat/languages/english.rb +4 -1
  62. data/lib/treat/lexicalizers/category/from_tag.rb +4 -4
  63. data/lib/treat/lexicalizers/linkages/naive.rb +3 -3
  64. data/lib/treat/lexicalizers/tag/brill.rb +3 -11
  65. data/lib/treat/lexicalizers/tag/lingua.rb +4 -6
  66. data/lib/treat/lexicalizers.rb +0 -2
  67. data/lib/treat/processors/chunkers/txt.rb +4 -4
  68. data/lib/treat/processors/parsers/enju.rb +3 -17
  69. data/lib/treat/processors/parsers/stanford.rb +4 -0
  70. data/lib/treat/processors/segmenters/punkt.rb +1 -0
  71. data/lib/treat/processors/segmenters/stanford.rb +4 -0
  72. data/lib/treat/processors/segmenters/tactful.rb +4 -1
  73. data/lib/treat/processors/tokenizers/punkt.rb +1 -2
  74. data/lib/treat/processors/tokenizers/stanford.rb +4 -0
  75. data/lib/treat/processors/tokenizers/tactful.rb +1 -1
  76. data/lib/treat/processors.rb +4 -4
  77. data/lib/treat/proxies.rb +18 -11
  78. data/lib/treat/registrable.rb +12 -5
  79. data/lib/treat/sugar.rb +8 -3
  80. data/lib/treat/tree.rb +10 -3
  81. data/lib/treat.rb +55 -55
  82. data/test/tc_entity.rb +7 -7
  83. data/test/tc_extractors.rb +6 -4
  84. data/test/tc_formatters.rb +0 -4
  85. data/test/tests.rb +2 -0
  86. data/test/texts.rb +4 -4
  87. metadata +48 -56
  88. data/examples/texts/bugged_out.txt +0 -26
  89. data/examples/texts/half_cocked_basel.txt +0 -16
  90. data/examples/texts/hedge_funds.txt +0 -24
  91. data/examples/texts/hose_and_dry.txt +0 -19
  92. data/examples/texts/hungarys_troubles.txt +0 -46
  93. data/examples/texts/indias_slowdown.txt +0 -15
  94. data/examples/texts/merkozy_rides_again.txt +0 -24
  95. data/examples/texts/prada_is_not_walmart.txt +0 -9
  96. data/examples/texts/republican_nomination.txt +0 -26
  97. data/examples/texts/to_infinity_and_beyond.txt +0 -15
  98. data/lib/treat/entities/text.rb +0 -7
  99. data/lib/treat/extractors/key_sentences/topics_frequency.rb +0 -49
  100. data/lib/treat/formatters/cleaners/html.rb +0 -17
@@ -1,6 +1,8 @@
1
1
  module Treat
2
2
  module Extractors
3
3
  module Statistics
4
+ # Experimental algorithm to calculate the transition
5
+ # probability of an observed word.
4
6
  class TransitionProbability
5
7
 
6
8
  # Find the transition probability.
@@ -18,14 +20,16 @@ module Treat
18
20
  next unless tm[f1][v1]
19
21
 
20
22
  relationships.each do |relationship|
21
- relatives = target.send(relationship)
23
+ relatives = entity.send(relationship)
22
24
  relatives = [relatives] unless relatives.is_a? Array
23
25
  relatives.each do |relative|
24
26
  next if relative.nil? || !relative.has?(f2)
25
27
  v2 = relative.send(f2)
26
- if tm[f1][v1][relationship][f2][v2]
27
- score += tm[f1][v1][relationship][f2][v2]
28
- count += 1
28
+ if tm[f1][v1][relationship] &&
29
+ tm[f1][v1][relationship][f2] &&
30
+ tm[f1][v1][relationship][f2][v2]
31
+ score += tm[f1][v1][relationship][f2][v2]
32
+ count += 1
29
33
  end
30
34
  end
31
35
  end
@@ -1,8 +1,16 @@
1
1
  module Treat
2
2
  module Extractors
3
3
  module Time
4
+ # A wrapper for the 'chronic' gem, which parses
5
+ # time and date information.
6
+ #
7
+ # Project website: http://chronic.rubyforge.org/
4
8
  class Chronic
5
9
  silence_warnings { require 'chronic' }
10
+ # Return the time information contained within the entity
11
+ # by parsing it with the 'chronic' gem.
12
+ #
13
+ # Options: none.
6
14
  def self.time(entity, options = {})
7
15
  silence_warnings { ::Chronic.parse(entity.to_s, {:guess => true}) }
8
16
  end
@@ -1,8 +1,14 @@
1
1
  module Treat
2
2
  module Extractors
3
3
  module Time
4
+ # A wrapper for Ruby's native date/time parsing.
4
5
  module Native
5
6
  require 'date'
7
+ # Return a DateTime object representing the date/time
8
+ # contained within the entity, using Ruby's native
9
+ # date/time parser.
10
+ #
11
+ # Options: none.
6
12
  def self.time(entity, options = {})
7
13
  ::DateTime.parse(entity.to_s)
8
14
  end
@@ -1,45 +1,53 @@
1
1
  module Treat
2
2
  module Extractors
3
3
  module Time
4
- =begin
5
- Annotations
6
-
7
- Type examples
8
-
9
- single "lunch with megan tomorrow at noon"
10
- daily "Art exhibit until March 1st"
11
- weekly "math class every wed from 8-11am"
12
- daymonthly "open bar at joes the first friday of every month"
13
- datemonthly "pay credit card bill on the 22nd of each month"
14
-
15
- =end
4
+ # A wrapper for the 'nickel' gem, which parses
5
+ # times and dates and supplies additional information
6
+ # concerning these. The additional information supplied
7
+ # that this class annotates entities with is:
8
+ #
9
+ # - time_recurrence: frequency of recurrence in words*.
10
+ # - time_recurrence_interval: frequency of recurrence in days.
11
+ # - start_time: a DateTime object representing the beginning of
12
+ # an event.
13
+ # - end_time: a DateTime object representing the end of an event.
14
+ #
15
+ # Examples of values for time_recurrence are:
16
+ #
17
+ # - single: "lunch with megan tomorrow at noon"
18
+ # - daily: "Art exhibit until March 1st"
19
+ # - weekly: "math class every wed from 8-11am"
20
+ # - daymonthly: "open bar at joes the first friday of every month"
21
+ # - datemonthly: "pay credit card bill on the 22nd of each month"
22
+ #
23
+ # Project website: http://naturalinputs.com/
16
24
  module Nickel
17
25
  require 'date'
18
26
  silence_warnings { require 'nickel' }
27
+ # Extract time information from a bit of text.
19
28
  def self.time(entity, options = {})
20
29
  n = silence_warnings { ::Nickel.parse(entity.to_s) }
21
30
  occ = n.occurrences[0]
22
- # Find the words..
31
+
23
32
  rec = occ.type.to_s.gsub('single', 'once').intern
24
33
  entity.set :time_recurrence, rec
25
- interval = occ.interval ? occ.interval.intern : :none
34
+ interval = occ.interval ? occ.interval : :none
26
35
  entity.set :time_recurrence_interval, interval
27
-
36
+
28
37
  s = [occ.start_date, occ.start_time]
29
38
  ds = [s[0].year, s[0].month, s[0].day] if s[0]
30
- ts = [s[1].hour, s[1].min, s[1].sec] if s[1]
31
-
39
+ #ts = [s[1].hour, s[1].min, s[1].sec] if s[1]
40
+
32
41
  e = [occ.end_date, occ.end_time]
33
42
  de = [e[0].year, e[0].month, e[0].day] if e[0]
34
- te = [e[1].hour, e[1].min, e[1].sec] if e[1]
35
-
36
- entity.set :start_time, ::DateTime.civil(*ds, *ts) if ds
37
- entity.set :end_time, ::DateTime.civil(*de, *te) if de
38
-
43
+ #te = [e[1].hour, e[1].min, e[1].sec] if e[1]
44
+
45
+ entity.set :start_time, ::DateTime.civil(*ds) if ds
46
+ entity.set :end_time, ::DateTime.civil(*de) if de
47
+
39
48
  entity.start_time
40
49
  end
41
50
  end
42
51
  end
43
52
  end
44
53
  end
45
-
@@ -9,6 +9,8 @@ module Treat
9
9
  # Blei, David M., Ng, Andrew Y., and Jordan, Michael
10
10
  # I. 2003. Latent dirichlet allocation. Journal of
11
11
  # Machine Learning Research. 3 (Mar. 2003), 993-1022.
12
+ #
13
+ # Project website: https://github.com/ealdent/lda-ruby
12
14
  class LDA
13
15
  # Require the lda-ruby gem.
14
16
  silence_warnings { require 'lda-ruby' }
@@ -17,25 +19,28 @@ module Treat
17
19
  Lda::TextCorpus.class_eval do
18
20
  # Ruby, Y U NO SHUT UP!
19
21
  silence_warnings { undef :initialize }
20
- # Redefine initialize to take in an array of texts.
21
- def initialize(texts)
22
+ # Redefine initialize to take in an array of sections
23
+ def initialize(sections)
22
24
  super(nil)
23
- texts.each do |text|
24
- add_document(Lda::TextDocument.new(self, text))
25
+ sections.each do |section|
26
+ add_document(Lda::TextDocument.new(self, section))
25
27
  end
26
28
  end
27
29
  end
30
+ # Default options for the LDA algorithm.
31
+ DefaultOptions = {
32
+ topics: 20,
33
+ words_per_topic: 10,
34
+ iterations: 20
35
+ }
36
+ # Retrieve the topic words of a collection.
28
37
  def self.topic_words(collection, options = {})
29
- # Set the options
30
- options[:words_per_topic] ||= 10
31
- options[:topics] ||= 20
32
- options[:iterations] ||= 20
33
-
38
+ options = DefaultOptions.merge(options)
34
39
  # Create a corpus with the collection
35
- texts = collection.texts.collect do |t|
40
+ sections = collection.sections.collect do |t|
36
41
  t.to_s.encode("UTF-8", :invalid => :replace, :undef => :replace, :replace => "?")
37
42
  end
38
- corpus = Lda::TextCorpus.new(texts)
43
+ corpus = Lda::TextCorpus.new(sections)
39
44
 
40
45
  # Create an Lda object for training
41
46
  lda = Lda::Lda.new(corpus)
@@ -43,15 +48,15 @@ module Treat
43
48
  lda.max_iter = options[:iterations]
44
49
  # Run the EM algorithm using random starting points
45
50
  silence_streams(STDOUT, STDERR) { lda.em('random') }
46
-
51
+
47
52
  # Load the vocabulary.
48
53
  if options[:vocabulary]
49
54
  lda.load_vocabulary(options[:vocabulary])
50
55
  end
51
-
52
- # Get the topic words and annotate the text.
56
+
57
+ # Get the topic words and annotate the section.
53
58
  topic_words = lda.top_words(options[:words_per_topic])
54
-
59
+
55
60
  topic_words.each do |i, words|
56
61
  collection.each_word do |word|
57
62
  if words.include?(word)
@@ -62,7 +67,7 @@ module Treat
62
67
  end
63
68
  end
64
69
  end
65
-
70
+
66
71
  topic_words
67
72
  end
68
73
  end
@@ -6,9 +6,9 @@ module Treat
6
6
  #
7
7
  # Copyright 2005 Mark Watson. All rights reserved.
8
8
  # This software is released under the GPL.
9
- #
10
- # Modifications for inclusion in Treat by
11
- # Louis Mullie (2011).
9
+ # Rewrite for inclusion in Treat by Louis Mullie (2011).
10
+ #
11
+ # Original project website: http://www.markwatson.com/opensource/
12
12
  class Reuters
13
13
  # Require the Nokogiri XML parser.
14
14
  require 'nokogiri'
@@ -17,6 +17,8 @@ module Treat
17
17
  @@region = {}
18
18
  @@topics = {}
19
19
  # Get the topic of the text.
20
+ #
21
+ # Options: none.
20
22
  def self.topics(text, options = {})
21
23
  stems = []
22
24
  @@reduce = 0
@@ -33,7 +35,7 @@ module Treat
33
35
  topics = score_words(@@industry, stems)
34
36
  topics = topics.merge(score_words(@@region, stems))
35
37
  topics = topics.merge(score_words(@@topics, stems))
36
- Treat::Feature.new(topics)
38
+ #Treat::Feature.new(topics)
37
39
  end
38
40
  # Read the topics from the XML files.
39
41
  def self.get_topics
@@ -6,19 +6,19 @@ module Treat
6
6
  module Time
7
7
  extend Group
8
8
  self.type = :annotator
9
- self.targets = [:word, :constituent, :symbol]
9
+ self.targets = [:sentence, :word, :constituent, :symbol]
10
10
  end
11
11
  # Extract the topic from a text.
12
12
  module Topics
13
13
  extend Group
14
14
  self.type = :annotator
15
- self.targets = [:collection, :document, :text, :zone, :sentence]
15
+ self.targets = [:collection, :document, :zone, :sentence]
16
16
  end
17
17
  # Extract the topic from a text.
18
18
  module TopicWords
19
19
  extend Group
20
20
  self.type = :annotator
21
- self.targets = [:collection, :document, :text, :zone, :sentence]
21
+ self.targets = [:collection, :document, :zone, :sentence]
22
22
  end
23
23
  # Extract named entities from texts.
24
24
  module NamedEntity
@@ -27,15 +27,15 @@ module Treat
27
27
  self.targets = [:entity]
28
28
  end
29
29
  # Extract the keywords from a text.
30
- module KeySentences
30
+ module Keywords
31
31
  extend Group
32
- self.type = :computer
33
- self.targets = [:collection, :document, :text, :zone, :sentence]
32
+ self.type = :annotator
33
+ self.targets = [:collection, :document, :zone, :sentence]
34
34
  end
35
35
  # This module should be moved out of here ASAP.
36
36
  module Statistics
37
37
  extend Group
38
- self.type = :computer
38
+ self.type = :annotator
39
39
  self.targets = [:entity]
40
40
  self.default = :none
41
41
  end
@@ -0,0 +1,32 @@
1
+ module Treat
2
+ module Formatters
3
+ module Readers
4
+ class Abw
5
+ require 'rexml/document'
6
+ require 'rexml/streamlistener'
7
+ def self.read(document, options = {})
8
+ xml_h = AbiWordXmlHandler.new(
9
+ REXML::Document.parse_stream((IO.read(document.file)), xml_h))
10
+ document << xml_h.plain_text
11
+ document
12
+ end
13
+ class AbiWordXmlHandler
14
+ include REXML::StreamListener
15
+ attr_reader :plain_text
16
+ def initialize
17
+ @plain_text = ""
18
+ end
19
+ def text s
20
+ begin
21
+ s = s.strip
22
+ if s.length > 0
23
+ @plain_text << s
24
+ @plain_text << "\n"
25
+ end
26
+ end if s != 'AbiWord' && s != 'application/x-abiword'
27
+ end
28
+ end
29
+ end
30
+ end
31
+ end
32
+ end
@@ -6,26 +6,28 @@ module Treat
6
6
  # the appropriate reader based on the file
7
7
  # extension of the supplied document.
8
8
  class Autoselect
9
- # A list of image extensions that should be routed
10
- # to the Ocropus OCR engine.
9
+ # A list of image extensions that should be routed to OCR.
11
10
  ImageExtensions = ['gif', 'jpg', 'jpeg', 'png']
11
+ # Default options.
12
+ DefaultOptions = {:ocr => :ocropus}
12
13
  # Select the appropriate reader based on the format
13
14
  # of the filename in document.
14
15
  #
15
16
  # Options:
16
- # :ocr => :ocropus | :gocr (the OCR engine to use).
17
- def self.read(document, options = {:ocr => :ocropus})
17
+ #
18
+ # - :ocr => :ocropus or :gocr (the OCR engine to use).
19
+ def self.read(document, options)
20
+ options = DefaultOptions.merge(options)
18
21
  ext = document.file.split('.')[-1]
19
- if ImageExtensions.include?(ext)
20
- reader = 'ocropus'
21
- else
22
- reader = ext
23
- end
22
+ reader = ImageExtensions.include?(ext) ? 'ocropus' : ext
23
+ reader = 'html' if reader == 'htm'
24
+ reader = 'yaml' if reader == 'yml'
24
25
  begin
25
26
  r = Treat::Formatters::Readers.const_get(cc(reader))
26
- rescue NameError
27
+ rescue NameError => e
28
+ puts e.message
27
29
  raise Treat::Exception,
28
- "Cannot find a default reader for format: '#{ext}'."
30
+ "Cannot find a reader for format: '#{ext}'."
29
31
  end
30
32
  document = r.read(document, options)
31
33
  end
@@ -0,0 +1,13 @@
1
+ module Treat
2
+ module Formatters
3
+ module Readers
4
+ class Doc
5
+ def self.read(document, options = {})
6
+ f = `antiword #{document.file}`
7
+ document << Treat::Entities::Entity.from_string(f)
8
+ document
9
+ end
10
+ end
11
+ end
12
+ end
13
+ end
@@ -10,6 +10,8 @@ module Treat
10
10
  # Project site: http://jocr.sourceforge.net
11
11
  class GOCR
12
12
  # Read a file using the GOCR reader.
13
+ #
14
+ # Options: none.
13
15
  def self.read(document, options = {})
14
16
  create_temp_file(:pgm) do |tmp|
15
17
  `convert #{document.file} #{tmp}`
@@ -1,11 +1,31 @@
1
1
  module Treat
2
2
  module Formatters
3
3
  module Readers
4
+ # A temporary HTML reader; simply strips the
5
+ # document of all of its markup.
4
6
  class HTML
7
+ # Require Hpricot.
8
+ silence_warnings { require 'hpricot' }
9
+ # By default, strip the HTML markup without backing up the original HTML.
10
+ DefaultOptions = { clean: true, backup: false }
11
+ # Read the HTML document and strip it of its markup.
12
+ #
13
+ # Options:
14
+ #
15
+ # - (Boolean) :clean => whether to strip HTML markup.
16
+ # - (Boolean) :backup => whether to backup the HTML
17
+ # markup while cleaning.
5
18
  def self.read(document, options = {})
19
+ options = DefaultOptions.merge(options)
6
20
  f = File.read(document.file)
7
21
  document << Treat::Entities::Entity.from_string(f)
8
- document.clean(:html)
22
+ if options[:clean]
23
+ document.each do |section|
24
+ section.set :html_value, section.value if options[:backup]
25
+ section.value = Hpricot(section.value).inner_text
26
+ end
27
+ end
28
+ document
9
29
  end
10
30
  end
11
31
  end
@@ -15,11 +15,11 @@ module Treat
15
15
  # DFKI and U. Kaiserslautern, Germany.
16
16
  class Ocropus
17
17
  # Read a file using the Google Ocropus reader.
18
+ #
19
+ # Options: none.
18
20
  def self.read(document, options = {})
19
21
  create_temp_file(:txt) do |tmp|
20
- capture(:stderr) do
21
- `ocropus page #{document.file} > #{tmp} -STDIO 2>/dev/null`
22
- end
22
+ `ocropus page #{document.file} > #{tmp} -STDIO 2>/dev/null`
23
23
  f = File.read(tmp)
24
24
  document << Treat::Entities::Entity.from_string(f)
25
25
  end
@@ -0,0 +1,41 @@
1
+ module Treat
2
+ module Formatters
3
+ module Readers
4
+ class Odt
5
+ # Parse the document's file as XML and append the extracted plain text.
6
+ def self.read(document, options = {})
7
+ f = File.read(document.file)
8
+ f = f.force_encoding("UTF-8")
9
+ xml_h = OOXmlHandler.new(
10
+ REXML::Document.parse_stream(f, xml_h)
11
+ )
12
+ document << xml_h.plain_text
13
+ document
14
+ end
15
+
16
+ class OOXmlHandler
17
+ require 'rexml/document'
18
+ require 'rexml/streamlistener'
19
+ include REXML::StreamListener
20
+ attr_reader :plain_text
21
+ def initialize
22
+ @plain_text = ""
23
+ end
24
+ def tag_start(name, attrs)
25
+ @last_name = name
26
+ end
27
+ def text(s)
28
+ if @last_name.index('text')
29
+ s = s.strip
30
+ if s.length > 0
31
+ @plain_text << s
32
+ @plain_text << "\n"
33
+ end
34
+ end
35
+ end
36
+ end
37
+ end
38
+
39
+ end
40
+ end
41
+ end
@@ -1,9 +1,12 @@
1
1
  module Treat
2
2
  module Formatters
3
3
  module Readers
4
+ # A wrapper for the Poppler pdf2text utility, which
5
+ # extracts the text from a PDF file.
4
6
  class PDF
5
- require 'fileutils'
6
- # Read a file using the Poppler pdf2text utility.
7
+ # Read a PDF file using the Poppler pdf2text utility.
8
+ #
9
+ # Options: none.
7
10
  def self.read(document, options = {})
8
11
  create_temp_file(:txt) do |tmp|
9
12
  `pdftotext #{document.file} #{tmp} `.strip
@@ -4,6 +4,8 @@ module Treat
4
4
  # This class simply reads a plain text file.
5
5
  class Txt
6
6
  # Build an entity from a string in plain text format.
7
+ #
8
+ # Options: none.
7
9
  def self.read(document, options = {})
8
10
  f = File.read(document.file)
9
11
  document << Treat::Entities::Entity.from_string(f)
@@ -1,7 +1,7 @@
1
1
  module Treat
2
2
  module Formatters
3
3
  module Serializers
4
- # This class converts an entity to XML format.
4
+ # This class converts an entity to a storable XML format.
5
5
  class XML
6
6
  # Require the Nokogiri XML parser.
7
7
  require 'nokogiri'
@@ -9,7 +9,8 @@ module Treat
9
9
  def self.serialize(entity, options = {})
10
10
  options = {:indent => 0} if options[:indent].nil?
11
11
  if options[:indent] == 0
12
- string = '<?xml version="1.0" encoding="UTF-8" standalone="no" ?>'
12
+ enc = entity.encoding(:r_chardet19).to_s.gsub('_', '-').upcase
13
+ string = "<?xml version=\"1.0\" encoding=\"#{enc}\" standalone=\"no\" ?>"
13
14
  else
14
15
  string = ''
15
16
  end
@@ -6,6 +6,8 @@ module Treat
6
6
  # This class serializes entities in YAML format.
7
7
  class YAML
8
8
  # Serialize an entity in YAML format.
9
+ #
10
+ # Options: none.
9
11
  def self.serialize(entity, options = {})
10
12
  ::Psych.dump(entity)
11
13
  end
@@ -1,7 +1,13 @@
1
1
  module Treat
2
2
  module Formatters
3
3
  module Unserializers
4
+ # This class doesn't perform any unserializing;
5
+ # it simply routes the document to an unserializer
6
+ # based on the file extension of the document.
4
7
  class Autoselect
8
+ # Unserialize any supported file format.
9
+ #
10
+ # Options: none.
5
11
  def self.unserialize(document, options = {})
6
12
  ext = document.file.split('.')[-1]
7
13
  if ext == 'yaml' || ext == 'yml'
@@ -9,7 +15,7 @@ module Treat
9
15
  elsif ext == 'xml'
10
16
  document.unserialize(:xml)
11
17
  else
12
- raise "File #{document.file} was not recognized"+
18
+ raise "File #{document.file} was not recognized "+
13
19
  "as a supported serialized format."
14
20
  end
15
21
  end
@@ -1,9 +1,13 @@
1
1
  module Treat
2
2
  module Formatters
3
3
  module Unserializers
4
+ # Recreates the entity tree corresponding to
5
+ # a serialized XML file.
4
6
  class XML
5
7
  require 'nokogiri'
6
-
8
+ # Unserialize an entity stored in XML format.
9
+ #
10
+ # Options: none.
7
11
  def self.unserialize(document, options = {})
8
12
  # Read in the XML file.
9
13
  xml = File.read(document.file)
@@ -59,6 +63,7 @@ module Treat
59
63
  current_value = xml_reader.value.strip
60
64
  if current_value && current_value != ''
61
65
  current_element.value = current_value
66
+ current_element.register_token(current_element)
62
67
  end
63
68
  end
64
69
 
@@ -1,10 +1,14 @@
1
1
  module Treat
2
2
  module Formatters
3
3
  module Unserializers
4
+ # This class is a wrapper for the Psych YAML
5
+ # parser; it unserializes YAML files.
4
6
  class YAML
5
7
  # Require the Psych YAML parser.
6
8
  require 'psych'
7
- # Unserialize a YAML file representing an entity.
9
+ # Unserialize a YAML file.
10
+ #
11
+ # Options: none.
8
12
  def self.unserialize(document, options = {})
9
13
  document << ::Psych.load(File.read(document.file))
10
14
  document