wordtree 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,18 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ .DS_Store
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format progress
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in wordtree.gemspec
4
+ gemspec
5
+
6
+ # gem "archivist-client", :path => "../archivist-client"
data/Guardfile ADDED
@@ -0,0 +1,8 @@
1
+ # A sample Guardfile
2
+ # More info at https://github.com/guard/guard#readme
3
+
4
+ guard :rspec, :cmd => 'bundle exec rspec' do
5
+ watch(%r{^spec/.+_spec\.rb$})
6
+ watch(%r{^lib/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
7
+ watch('spec/spec_helper.rb') { "spec" }
8
+ end
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Duane Johnson
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,38 @@
1
+ # Wordtree
2
+
3
+ This is the WordTree ruby gem for text analysis.
4
+
5
+ ## Installation
6
+
7
+ $ gem install wordtree
8
+
9
+ ## Usage
10
+
11
+ require 'wordtree'
12
+
13
+ library = WordTree::Library.new("/tmp/library")
14
+ librarian = WordTree::Librarian.new(library)
15
+
16
+ Find a book in your on-disk "library":
17
+
18
+ book = librarian.find('firstbooknapole00gruagoog')
19
+ book.metadata
20
+ book.content
21
+
22
+ Modify and save a book to your "library":
23
+
24
+ book.year = 2014
25
+ librarian.save(book)
26
+
27
+ Download a book from Archive.org to your "library":
28
+
29
+ book_ids = librarian.archive_org_get_book('latewarbetween_00hunt')
30
+
31
+
32
+ ## Contributing
33
+
34
+ 1. Fork it
35
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
36
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
37
+ 4. Push to the branch (`git push origin my-new-feature`)
38
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,58 @@
1
+ require 'archivist/client'
2
+ require 'retriable'
3
+
4
module WordTree
  # Thin wrapper around the archivist-client gem: searches archive.org and
  # hands each book's metadata and text to a caller-supplied block.
  class Archdown
    # The Archivist::Client::Base instance used for searches
    attr_reader :client

    def initialize
      @client = Archivist::Client::Base.new
    end

    # Builds the metadata hash that is stored alongside a book's content.
    #
    # +archivist_book+ must respond to +title+, +creators+, +date+ and
    # +identifier+. Returns a Hash with String keys; 'author' is nil when the
    # book has no creators and 'year' is nil when it has no date.
    def metadata_for(archivist_book)
      creators = archivist_book.creators
      date = archivist_book.date
      identifier = archivist_book.identifier
      {
        'title' => archivist_book.title,
        'author' => creators ? creators.join('; ') : nil,
        # Guarded the same way as 'author' so an undated book does not
        # raise NoMethodError on nil.
        'year' => date ? date.year : nil,
        'source' => "http://archive.org/details/#{identifier}",
        'status' => "OCR ONLY",
        'archive_org_id' => identifier,
      }
    end

    # Downloads the text of a book.
    #
    # Returns a two-element Array: [content, nil] on success, or
    # [nil, error_message] when the download raises.
    def content_for(archivist_book)
      [archivist_book.download, nil]
    rescue Archivist::Model::Document::UnsupportedFormat, StandardError => e
      # UnsupportedFormat stays listed explicitly (as in the original two
      # rescue clauses) in case it does not descend from StandardError.
      [nil, e.to_s]
    end

    # Pages through the search results for +search_terms+ (a Hash that is
    # merged with the current :page), starting at page 1 and stopping at the
    # first empty page. Every book found is passed to #download together with
    # the given block. Searches that time out are retried via Retriable.
    def download_all(search_terms, &each_book)
      page = 1
      loop do
        archivist_books =
          ::Retriable.retriable(:on => Faraday::Error::TimeoutError) do
            @client.search(search_terms.merge(:page => page))
          end

        break if archivist_books.empty?

        archivist_books.each do |archivist_book|
          download(archivist_book, &each_book)
        end

        page += 1
      end
    end

    # Fetches metadata and content for one book and, when a block is given,
    # yields (metadata, content, failure) to it. +failure+ is nil on success
    # and an error message String otherwise.
    def download(archivist_book, &block)
      metadata = metadata_for(archivist_book)
      content, failure = content_for(archivist_book)

      yield metadata, content, failure if block_given?
    end

  end
end
@@ -0,0 +1,38 @@
1
+ require 'virtus'
2
+ require 'wordtree/text_utils'
3
+
4
module WordTree
  # A single book: descriptive metadata plus its text content, modelled as a
  # Virtus attribute set.
  class Book
    include Virtus.model

    # Falls back to #default_id when no id is supplied
    attribute :id, String, :default => :default_id
    attribute :archive_org_id, String
    attribute :title, String
    attribute :author, String
    attribute :year, Integer
    attribute :source, String
    attribute :status, String
    # Size of the content in bytes
    attribute :size_bytes, Integer
    # A simhash (locality-sensitive hash) of the content
    attribute :simhash, String

    attribute :content, String

    # Builds a Book from a metadata Hash, overriding its "id" and "content"
    # entries with the given values.
    def self.create(id, metadata, content)
      overrides = { "id" => id, "content" => content }
      new(metadata.merge(overrides))
    end

    # The id used when none was given explicitly: the archive.org identifier.
    def default_id
      archive_org_id
    end

    # All attributes that have a value, minus :content and :id.
    def metadata
      attributes.reject do |key, value|
        value.nil? || key == :content || key == :id
      end
    end

    # The content passed through TextUtils.clean_text.
    def clean_content
      TextUtils.clean_text(content)
    end
  end
end
@@ -0,0 +1,58 @@
1
+ require 'preamble'
2
+ require 'wordtree/book'
3
+ require 'wordtree/library'
4
+ require 'wordtree/archdown'
5
+
6
module WordTree
  # Loads books from, and stores books into, an on-disk Library, and can
  # populate that library from archive.org via Archdown.
  class Librarian
    # The Library this librarian reads from and writes to
    attr_reader :library

    def initialize(library)
      @library = library
    end

    # Loads the stored file for +book_id+ and returns it as a Book.
    def find(book_id)
      stored = Preamble.load(library.path_to(book_id))
      Book.create(book_id, stored.metadata, stored.content)
    end

    # Creates the book's directory in the library, then writes its metadata
    # and content there.
    def save(book)
      library.mkdir(book.id)
      document = Preamble.new(book.metadata, book.content)
      document.save(library.path_to(book.id))
    end

    # Downloads the archive.org item whose identifier is +book_id+.
    def archive_org_get_book(book_id, &block)
      conditions = { :filters => ["identifier:#{book_id}"] }
      archive_org_get(conditions, &block)
    end

    # Downloads archive.org items for the given span of years.
    def archive_org_get_range_of_years(start_year, end_year, &block)
      conditions = { :start_year => start_year, :end_year => end_year }
      archive_org_get(conditions, &block)
    end

    # Downloads a set of books to the on-disk library and
    # returns a list of book_ids. Each saved book is yielded to the optional
    # block together with this librarian; failed downloads are reported on
    # stderr and skipped.
    def archive_org_get(conditions, &block)
      saved_ids = []
      Archdown.new.download_all(conditions) do |metadata, content, failure|
        if failure
          #TODO: logging
          $stderr.puts "Unable to download from archive.org: #{failure}"
          next
        end

        book = Book.create(metadata["archive_org_id"], metadata, content)
        save(book)
        yield book, self if block_given?
        saved_ids << book.id
      end
      saved_ids
    end


  end
end
@@ -0,0 +1,41 @@
1
+ require 'fileutils'
2
+
3
+ require 'wordtree/archdown'
4
+ require 'wordtree/library_locator'
5
+
6
module WordTree
  # Maps book IDs onto directories and file paths beneath a library root.
  class Library

    # File name templates per file type; %s is replaced by the book ID
    FILE_TYPES = {
      :raw => "%s.md"
    }

    # The file path to the root of the library directory, e.g. /data/library
    attr_reader :root

    def initialize(root)
      @root = root
    end

    # returns the full path of a book's subdirectory within the library
    # Accepts either a String or a LibraryLocator object
    def dir_of(book_id)
      relative_dir = LibraryLocator.identity(book_id).relpath
      File.expand_path(relative_dir, root)
    end

    # Full path of the book's file of the given type (default :raw)
    def path_to(book_id, type=:raw)
      directory = dir_of(book_id)
      File.join(directory, file_type(book_id, type))
    end

    # File name (no directory) of the book's file of the given type
    def file_type(book_id, type=:raw)
      id = LibraryLocator.identity(book_id).id
      template = FILE_TYPES[type]
      template % id
    end

    # Create all subdirs up to the location where a book is stored
    # Accepts either a String or a LibraryLocator object
    def mkdir(book_id)
      target = dir_of(book_id)
      FileUtils.mkdir_p(target)
    end
  end
end
@@ -0,0 +1,44 @@
1
module WordTree
  # A class that converts from a book ID to a location within the library, e.g.
  #
  #   "firstbooknapole00gruagoog"
  #
  # becomes
  #
  #   "fi/og/firstbooknapole00gruagoog/"
  #
  # or, in context of the full path:
  #
  #   [/data/library/] "fi/og/firstbooknapole00gruagoog/" [firstbooknapole00gruagoog.md]
  #
  class LibraryLocator
    # The book ID to locate
    attr_reader :id

    # Construct a LibraryLocator from a string (book ID)
    def initialize(id)
      @id = id
    end

    # The first two characters of the ID, lower-cased (the whole ID when it
    # is shorter than two characters).
    def first
      @id[0, 2].downcase
    end

    # The last two characters of the ID, lower-cased. Slicing from -2 yields
    # nil for an ID shorter than two characters, so fall back to the whole ID
    # instead of raising NoMethodError.
    def last
      (@id[-2, 2] || @id).downcase
    end

    # Returns a "relative" path to be joined to the library root,
    # e.g. if the identifier is "firstbooknapole00gruagoog", then relpath
    # returns "fi/og/firstbooknapole00gruagoog", which a Library expands to
    # something like "/data/library/fi/og/firstbooknapole00gruagoog"
    def relpath
      File.join(first, last, @id)
    end

    # Constructor that is as willing to use a String as it is a LibraryLocator:
    # an existing locator is returned unchanged, anything else is wrapped.
    def self.identity(id)
      id.is_a?(LibraryLocator) ? id : new(id)
    end
  end
end
@@ -0,0 +1,73 @@
1
module WordTree
  # Stateless helpers for normalising and line-wrapping raw book text.
  module TextUtils
    # Splits +text+ into two pieces at the last space found at or before
    # +split_index+; the space itself is dropped. When there is no such space
    # the text is cut exactly at +split_index+. When +split_index+ is at or
    # beyond the end of the text, returns [text, ""].
    #
    # Returns a two-element Array [head, tail].
    def self.split_near(text, split_index)
      return [text, ""] if split_index >= text.size

      # rindex scans backwards from split_index. A negative index would be
      # interpreted relative to the end of the string, so skip the search then.
      space_at = split_index >= 0 ? text.rindex(' ', split_index) : nil
      if space_at
        [text[0...space_at], text[(space_at + 1)..-1]]
      else
        [text[0...split_index], text[split_index..-1]]
      end
    end

    # Remove punctuation and non-alphabetical characters from a text, and
    # return a cleaned-up, lower-cased version wrapped at +wrap+ characters
    # per line.
    #
    # Only the letters a-z and single spaces survive. Newlines count as
    # spaces, runs of spaces collapse to one, and a dash glues the text on
    # either side of it together (swallowing any spaces/newlines that follow
    # the dash), so words hyphenated across a line break are rejoined.
    #
    # +input+ may be nil, which is treated as empty text. Every output line,
    # including the last, ends in "\n"; empty text yields a single "\n".
    #
    # Raises ArgumentError when +wrap+ is not positive, since no progress
    # could be made while wrapping.
    def self.clean_text(input, wrap=120)
      raise ArgumentError, "wrap must be positive" unless wrap > 0

      # Replace invalid or unconvertible characters rather than failing on
      # them. Lower-casing here means no upper-case letter reaches the filter.
      normalized = input.to_s.encode('UTF-8', :invalid => :replace, :undef => :replace).downcase

      cleaned = String.new
      joining = false
      last_was_space = false
      normalized.each_char do |char|
        # Change newlines to spaces
        char = ' ' if char == "\n"

        if char == '-'
          # In case of a dash, set the scoop-spaces-up flag
          joining = true
        elsif joining && char == ' '
          # ignore spaces directly after a dash
        elsif char.between?('a', 'z') || (char == ' ' && !last_was_space)
          # Add letters and (non-repeated) spaces
          cleaned << char
          last_was_space = (char == ' ')
          joining = false
        end
      end

      wrapped = String.new
      remainder = cleaned
      # Always emit at least one line, then keep splitting while what is left
      # is still longer than a line.
      loop do
        line, remainder = split_near(remainder, wrap)
        wrapped << line << "\n"
        break unless remainder.size > wrap
      end
      wrapped << remainder << "\n" unless remainder.empty?

      wrapped
    end
  end
end
@@ -0,0 +1,3 @@
1
# NOTE: this namespace is spelled Wordtree, whereas the library classes are
# defined under WordTree.
module Wordtree
  # Version of the gem. Frozen so the constant's value cannot be mutated.
  VERSION = "0.0.1".freeze
end
data/lib/wordtree.rb ADDED
@@ -0,0 +1,5 @@
1
require "wordtree/version"

# Load the library's components so that a plain `require 'wordtree'` makes
# the WordTree classes available.
require "wordtree/text_utils"
require "wordtree/library_locator"
require "wordtree/library"
require "wordtree/book"
require "wordtree/archdown"
# NOTE(review): path assumed from the naming pattern of the other
# components (class Librarian -> wordtree/librarian) -- confirm.
require "wordtree/librarian"

module Wordtree
  # Your code goes here...
end