wordtree 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,18 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ .DS_Store
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format progress
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in wordtree.gemspec
4
+ gemspec
5
+
6
+ # gem "archivist-client", :path => "../archivist-client"
data/Guardfile ADDED
@@ -0,0 +1,8 @@
1
+ # A sample Guardfile
2
+ # More info at https://github.com/guard/guard#readme
3
+
4
# Re-run specs through `bundle exec rspec` when watched files change.
guard :rspec, :cmd => 'bundle exec rspec' do
  # Spec files themselves
  watch(%r{^spec/.+_spec\.rb$})
  # A file under lib/ yields the spec path that mirrors it
  watch(%r{^lib/(.+)\.rb$}) { |match| "spec/#{match[1]}_spec.rb" }
  # The spec helper yields the whole spec directory
  watch('spec/spec_helper.rb') { "spec" }
end
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Duane Johnson
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,38 @@
1
+ # Wordtree
2
+
3
+ This is the WordTree Ruby gem for text analysis.
4
+
5
+ ## Installation
6
+
7
+ $ gem install wordtree
8
+
9
+ ## Usage
10
+
11
+ require 'wordtree'
12
+
13
+ library = WordTree::Library.new("/tmp/library")
14
+ librarian = WordTree::Librarian.new(library)
15
+
16
+ Find a book in your on-disk "library":
17
+
18
+ book = librarian.find('firstbooknapole00gruagoog')
19
+ book.metadata
20
+ book.content
21
+
22
+ Modify and save a book to your "library":
23
+
24
+ book.year = 2014
25
+ librarian.save(book)
26
+
27
+ Download a book from Archive.org to your "library":
28
+
29
+ book_ids = librarian.archive_org_get_book('latewarbetween_00hunt')
30
+
31
+
32
+ ## Contributing
33
+
34
+ 1. Fork it
35
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
36
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
37
+ 4. Push to the branch (`git push origin my-new-feature`)
38
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,58 @@
1
+ require 'archivist/client'
2
+ require 'retriable'
3
+
4
module WordTree
  # Fetches book metadata and text from archive.org through the Archivist
  # client and hands the results to a caller-supplied block.
  class Archdown
    # The Archivist::Client::Base instance used for searches
    attr_reader :client

    def initialize
      @client = Archivist::Client::Base.new
    end

    # Builds the metadata hash for a book.
    #
    # archivist_book - an object responding to +title+, +creators+, +date+
    #                  and +identifier+
    #
    # Returns a Hash with String keys. 'author' is nil when the book has no
    # creators and 'year' is nil when the book has no date.
    def metadata_for(archivist_book)
      creators = archivist_book.creators
      date = archivist_book.date
      {
        'title' => archivist_book.title,
        'author' => creators ? creators.join('; ') : nil,
        # Guard against a missing date the same way creators is guarded
        'year' => date ? date.year : nil,
        'source' => "http://archive.org/details/#{archivist_book.identifier}",
        'status' => "OCR ONLY",
        'archive_org_id' => archivist_book.identifier,
      }
    end

    # Downloads the text of a book.
    #
    # Returns a two-element Array: [content, nil] on success, or
    # [nil, error_message] when the download raised.
    def content_for(archivist_book)
      [archivist_book.download, nil]
    rescue Archivist::Model::Document::UnsupportedFormat, StandardError => e
      # Failures are reported to the caller as a message, not raised
      [nil, e.to_s]
    end

    # Searches page by page, starting at page 1, and downloads every book
    # found. Stops at the first page that comes back empty.
    #
    # search_terms - a Hash of search conditions; :page is merged in here
    # each_book    - block receiving (metadata, content, failure) per book
    def download_all(search_terms, &each_book)
      page = 1
      loop do
        # Only the search itself is retried, and only on timeouts
        archivist_books =
          ::Retriable.retriable(:on => Faraday::Error::TimeoutError) do
            @client.search(search_terms.merge(:page => page))
          end

        break if archivist_books.empty?

        archivist_books.each do |archivist_book|
          download(archivist_book, &each_book)
        end

        page += 1
      end
    end

    # Collects metadata and content for a single book and yields
    # (metadata, content, failure) to the block, if one was given.
    def download(archivist_book, &block)
      metadata = metadata_for(archivist_book)
      content, failure = content_for(archivist_book)

      yield metadata, content, failure if block_given?
    end

  end
end
@@ -0,0 +1,38 @@
1
+ require 'virtus'
2
+ require 'wordtree/text_utils'
3
+
4
module WordTree
  # A book: its descriptive attributes plus its text content.
  class Book
    include Virtus.model

    # Falls back to the archive.org ID when no ID is given (see default_id)
    attribute :id, String, :default => :default_id
    attribute :archive_org_id, String
    attribute :title, String
    attribute :author, String
    attribute :year, Integer
    attribute :source, String
    attribute :status, String
    # Size of the content in bytes
    attribute :size_bytes, Integer
    # A simhash (locality-sensitive hash) of the content
    attribute :simhash, String

    attribute :content, String

    # Builds a Book from a metadata hash, overriding its "id" and "content"
    # entries with the given values.
    def self.create(id, metadata, content)
      book_attributes = metadata.merge("id" => id, "content" => content)
      new(book_attributes)
    end

    # Default value for the id attribute
    def default_id
      archive_org_id
    end

    # All attributes that have a value, leaving out :content and :id
    def metadata
      attributes.reject do |name, value|
        value.nil? || name == :content || name == :id
      end
    end

    # The content run through TextUtils.clean_text
    def clean_content
      TextUtils.clean_text(content)
    end
  end
end
@@ -0,0 +1,58 @@
1
+ require 'preamble'
2
+ require 'wordtree/book'
3
+ require 'wordtree/library'
4
+ require 'wordtree/archdown'
5
+
6
module WordTree
  # Loads books from, and saves books to, a Library, and can fill the library
  # with books downloaded through Archdown.
  class Librarian
    attr_reader :library

    def initialize(library)
      @library = library
    end

    # Loads the file at the library path for book_id and returns it as a Book
    def find(book_id)
      book_path = library.path_to(book_id)
      stored = Preamble.load(book_path)
      Book.create(book_id, stored.metadata, stored.content)
    end

    # Creates the book's directory and writes metadata plus content to the
    # book's library path
    def save(book)
      library.mkdir(book.id)
      document = Preamble.new(book.metadata, book.content)
      document.save(library.path_to(book.id))
    end

    # Downloads by archive.org identifier (see archive_org_get)
    def archive_org_get_book(book_id, &block)
      conditions = { :filters => ["identifier:#{book_id}"] }
      archive_org_get(conditions, &block)
    end

    # Downloads by a range of years (see archive_org_get)
    def archive_org_get_range_of_years(start_year, end_year, &block)
      conditions = { :start_year => start_year, :end_year => end_year }
      archive_org_get(conditions, &block)
    end

    # Downloads a set of books to the on-disk library and
    # returns a list of book_ids. Each saved book is yielded to the block
    # together with this librarian. Failed downloads are reported on stderr
    # and left out of the returned list.
    def archive_org_get(conditions, &block)
      saved_ids = []
      Archdown.new.download_all(conditions) do |metadata, content, failure|
        if failure
          #TODO: logging
          $stderr.puts "Unable to download from archive.org: #{failure}"
          next
        end

        book = Book.create(metadata["archive_org_id"], metadata, content)
        save(book)
        yield book, self if block_given?
        saved_ids << book.id
      end
      saved_ids
    end

  end
end
@@ -0,0 +1,41 @@
1
+ require 'fileutils'
2
+
3
+ require 'wordtree/archdown'
4
+ require 'wordtree/library_locator'
5
+
6
module WordTree
  # Maps book IDs onto directories and files beneath a root directory.
  class Library

    # File name templates by file type; %s is filled with the book ID
    FILE_TYPES = {
      :raw => "%s.md"
    }

    # The file path to the root of the library directory, e.g. /data/library
    attr_reader :root

    def initialize(root)
      @root = root
    end

    # returns the full path of a book's subdirectory within the library
    # Accepts either a String or a LibraryLocator object
    def dir_of(book_id)
      relative_dir = LibraryLocator.identity(book_id).relpath
      File.expand_path(relative_dir, root)
    end

    # Full path of a book's file of the given type
    def path_to(book_id, type=:raw)
      directory = dir_of(book_id)
      file_name = file_type(book_id, type)
      File.join(directory, file_name)
    end

    # File name (without directory) of a book's file of the given type
    def file_type(book_id, type=:raw)
      book_locator = LibraryLocator.identity(book_id)
      FILE_TYPES[type] % book_locator.id
    end

    # Create all subdirs up to the location where a book is stored
    # Accepts either a String or a LibraryLocator object
    def mkdir(book_id)
      book_dir = dir_of(book_id)
      FileUtils.mkdir_p(book_dir)
    end
  end
end
@@ -0,0 +1,44 @@
1
module WordTree
  # A class that converts from a book ID to a location within the library, e.g.
  #
  #   "firstbooknapole00gruagoog"
  #
  # becomes
  #
  #   "fi/og/firstbooknapole00gruagoog/"
  #
  # or, in context of the full path:
  #
  #   [/data/library/] "fi/og/firstbooknapole00gruagoog/" [firstbooknapole00gruagoog.md]
  #
  class LibraryLocator
    # The book ID to locate
    attr_reader :id

    # Construct a LibraryLocator from a string (book ID)
    def initialize(id)
      @id = id
    end

    # The first two characters of the ID, lower-cased (fewer if the ID is
    # shorter than two characters)
    def first
      @id[0..1].downcase
    end

    # The last two characters of the ID, lower-cased. An ID shorter than two
    # characters is used whole: the slice is nil in that case.
    def last
      (@id[-2..-1] || @id).downcase
    end

    # Returns a "relative" path to be joined to the library root,
    # e.g. if the identifier is "firstbooknapole00gruagoog", then relpath
    # should return "fi/og/firstbooknapole00gruagoog", i.e. probably later to
    # become something like "/data/library/fi/og/firstbooknapole00gruagoog"
    #
    # Raises ArgumentError for an empty ID, which has no usable location.
    def relpath
      raise ArgumentError, "book ID must not be empty" if @id.empty?

      File.join(first, last, @id)
    end

    # Constructor that is as willing to use a String as it is a LibraryLocator
    def self.identity(id)
      id.is_a?(LibraryLocator) ? id : new(id)
    end
  end
end
@@ -0,0 +1,73 @@
1
module WordTree
  # Helpers for normalizing raw book text.
  module TextUtils
    # Splits +text+ in two at the last space found at or before +split_index+.
    # The space itself is dropped. When there is no such space the text is cut
    # at exactly +split_index+.
    #
    # Returns a two-element Array [head, tail]; tail is "" when +split_index+
    # is at or beyond the end of the text.
    def self.split_near(text, split_index)
      return [text, ""] if split_index >= text.size

      # A negative index has nothing to scan backwards over
      space_index = split_index >= 0 ? text.rindex(' ', split_index) : nil
      if space_index
        [text[0...space_index], text[(space_index + 1)..-1]]
      else
        [text[0...split_index], text[split_index..-1]]
      end
    end

    # Remove punctuation and non-alphabetical characters from a text, and return
    # a cleaned-up version wrapped at +wrap+ characters per line.
    #
    # Only the letters a-z and single spaces survive. Newlines count as
    # spaces, runs of spaces collapse to one, and a dash swallows the spaces
    # that follow it so that words hyphenated across lines are rejoined.
    #
    # input - the text to clean; nil is treated as an empty text
    # wrap  - maximum line length, at least 1
    #
    # Returns the cleaned text, every line terminated by "\n".
    # Raises ArgumentError if +wrap+ is less than 1 (the wrapping loop could
    # never make progress).
    def self.clean_text(input, wrap=120)
      raise ArgumentError, "wrap must be at least 1" if wrap < 1

      # Replace characters that cannot be represented and lower-case the rest
      text = input.to_s.encode('UTF-8', :invalid => :replace, :undef => :replace).downcase

      cleaned = String.new
      join_lines = false
      just_added_space = false
      text.each_char do |char|
        # Change newlines to spaces
        char = ' ' if char == "\n"

        if char == '-'
          # In case of a dash, set the scoop-spaces-up flag
          join_lines = true
        elsif join_lines && char == ' '
          # ignore
        elsif (char >= 'a' && char <= 'z') || (char == ' ' && !just_added_space)
          # Add letters and spaces
          cleaned << char
          just_added_space = (char == ' ')
          join_lines = false
        end
      end

      wrapped_output = String.new
      remainder = cleaned
      # Always emits at least one line, even for empty text
      loop do
        line, remainder = split_near(remainder, wrap)
        wrapped_output << line << "\n"
        break unless remainder.size > wrap
      end
      wrapped_output << remainder << "\n" unless remainder.empty?

      wrapped_output
    end
  end
end
@@ -0,0 +1,3 @@
1
# Version of the gem.
module Wordtree
  VERSION = "0.0.1".freeze
end

# The library's classes are namespaced under WordTree (capital T), so the
# version is made available there as well.
module WordTree
  VERSION = Wordtree::VERSION unless const_defined?(:VERSION, false)
end
data/lib/wordtree.rb ADDED
@@ -0,0 +1,5 @@
1
require "wordtree/version"

# Load the library itself, so that `require 'wordtree'` is enough to use
# WordTree::Library, WordTree::Librarian and the other classes.
require "wordtree/text_utils"
require "wordtree/library_locator"
require "wordtree/library"
require "wordtree/book"
require "wordtree/archdown"
require "wordtree/librarian"

# Holds the gem's VERSION constant. The classes loaded above are defined in
# the WordTree module (capital T).
module Wordtree
end