wordtree 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +18 -0
- data/.rspec +2 -0
- data/Gemfile +6 -0
- data/Guardfile +8 -0
- data/LICENSE.txt +22 -0
- data/README.md +38 -0
- data/Rakefile +1 -0
- data/lib/wordtree/archdown.rb +58 -0
- data/lib/wordtree/book.rb +38 -0
- data/lib/wordtree/librarian.rb +58 -0
- data/lib/wordtree/library.rb +41 -0
- data/lib/wordtree/library_locator.rb +44 -0
- data/lib/wordtree/text_utils.rb +73 -0
- data/lib/wordtree/version.rb +3 -0
- data/lib/wordtree.rb +5 -0
- data/spec/fixtures/cassettes/archive_org_download_book.yml +1627 -0
- data/spec/fixtures/library/bo/ok/book/book.md +4 -0
- data/spec/spec_helper.rb +32 -0
- data/spec/wordtree/book_spec.rb +19 -0
- data/spec/wordtree/librarian_spec.rb +48 -0
- data/spec/wordtree/library_spec.rb +23 -0
- data/spec/wordtree/text_utils_spec.rb +54 -0
- data/wordtree.gemspec +34 -0
- metadata +269 -0
data/.gitignore
ADDED
data/.rspec
ADDED
data/Gemfile
ADDED
data/Guardfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2014 Duane Johnson
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
# Wordtree
|
2
|
+
|
3
|
+
This is the WordTree ruby gem for text analysis.
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
$ gem install wordtree
|
8
|
+
|
9
|
+
## Usage
|
10
|
+
|
11
|
+
require 'wordtree'
|
12
|
+
|
13
|
+
library = WordTree::Library.new("/tmp/library")
|
14
|
+
librarian = WordTree::Librarian.new(library)
|
15
|
+
|
16
|
+
Find a book in your on-disk "library":
|
17
|
+
|
18
|
+
book = librarian.find('firstbooknapole00gruagoog')
|
19
|
+
book.metadata
|
20
|
+
book.content
|
21
|
+
|
22
|
+
Modify and save a book to your "library":
|
23
|
+
|
24
|
+
book.year = 2014
|
25
|
+
librarian.save(book)
|
26
|
+
|
27
|
+
Download a book from Archive.org to your "library":
|
28
|
+
|
29
|
+
book_ids = librarian.archive_org_get_book('latewarbetween_00hunt')
|
30
|
+
|
31
|
+
|
32
|
+
## Contributing
|
33
|
+
|
34
|
+
1. Fork it
|
35
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
36
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
37
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
38
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
@@ -0,0 +1,58 @@
|
|
1
|
+
require 'archivist/client'
|
2
|
+
require 'retriable'
|
3
|
+
|
4
|
+
module WordTree
  # Fetches book records from archive.org through an Archivist client and
  # hands each one (metadata, text, failure message) to a caller-supplied block.
  class Archdown
    # The Archivist client used for searching.
    attr_reader :client

    def initialize
      @client = Archivist::Client::Base.new
    end

    # Builds the metadata hash stored alongside a book.
    #
    # archivist_book - an object responding to +title+, +creators+, +date+
    #                  and +identifier+.
    #
    # Returns a Hash with String keys. 'author' is nil when the record has no
    # creators and 'year' is nil when the record has no date, so an incomplete
    # record does not abort the download.
    def metadata_for(archivist_book)
      creators = archivist_book.creators
      date = archivist_book.date
      identifier = archivist_book.identifier

      {
        'title' => archivist_book.title,
        'author' => creators ? creators.join('; ') : nil,
        'year' => date ? date.year : nil,
        'source' => "http://archive.org/details/#{identifier}",
        'status' => "OCR ONLY",
        'archive_org_id' => identifier,
      }
    end

    # Downloads the text of a book.
    #
    # Returns a two-element Array: [content, nil] on success, or
    # [nil, message] when the download raised. Errors are reported through the
    # second element instead of being propagated.
    def content_for(archivist_book)
      [archivist_book.download, nil]
    rescue Archivist::Model::Document::UnsupportedFormat, StandardError => e
      [nil, e.to_s]
    end

    # Runs a paged search and downloads every book found.
    #
    # search_terms - Hash of search conditions; the current :page is merged in
    #                on every request.
    # each_book    - block receiving (metadata, content, failure) per book.
    #
    # Paging starts at 1 and stops at the first page with no results. Each
    # search request is retried on Faraday timeouts via Retriable.
    def download_all(search_terms, &each_book)
      page = 1
      loop do
        archivist_books =
          ::Retriable.retriable(:on => Faraday::Error::TimeoutError) do
            @client.search(search_terms.merge(:page => page))
          end

        break if archivist_books.empty?

        archivist_books.each do |archivist_book|
          download(archivist_book, &each_book)
        end

        page += 1
      end
    end

    # Gathers metadata and content for one book and yields
    # (metadata, content, failure) to the block, if one was given.
    def download(archivist_book, &block)
      metadata = metadata_for(archivist_book)
      content, failure = content_for(archivist_book)

      yield metadata, content, failure if block_given?
    end
  end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'virtus'
|
2
|
+
require 'wordtree/text_utils'
|
3
|
+
|
4
|
+
module WordTree
  # A book record: descriptive metadata plus the full text, declared as a
  # Virtus model.
  class Book
    include Virtus.model

    # Identifier used to locate the book; the default is named by the
    # :default_id symbol (presumably resolved by Virtus to #default_id below).
    attribute :id, String, :default => :default_id
    attribute :archive_org_id, String
    attribute :title, String
    attribute :author, String
    attribute :year, Integer
    attribute :source, String
    attribute :status, String
    # Size of the content in bytes
    attribute :size_bytes, Integer
    # A simhash (locality-sensitive hash) of the content
    attribute :simhash, String

    attribute :content, String

    # Builds a Book from a metadata Hash, forcing the given id and content
    # over whatever "id" / "content" entries the Hash already carries.
    def self.create(id, metadata, content)
      overrides = { "id" => id, "content" => content }
      new(metadata.merge(overrides))
    end

    # Fallback identifier: the archive.org identifier.
    def default_id
      archive_org_id
    end

    # Every attribute that has a value, leaving out the id and the content
    # themselves.
    def metadata
      attributes.reject do |name, value|
        value.nil? || name == :content || name == :id
      end
    end

    # The content run through TextUtils.clean_text.
    def clean_content
      TextUtils.clean_text(content)
    end
  end
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
require 'preamble'
|
2
|
+
require 'wordtree/book'
|
3
|
+
require 'wordtree/library'
|
4
|
+
require 'wordtree/archdown'
|
5
|
+
|
6
|
+
module WordTree
  # Moves books in and out of an on-disk Library, and fills the library with
  # downloads from archive.org.
  class Librarian
    # The library this librarian reads from and writes to.
    attr_reader :library

    def initialize(library)
      @library = library
    end

    # Loads the stored file for +book_id+ and returns it as a Book.
    def find(book_id)
      stored = Preamble.load(library.path_to(book_id))
      Book.create(book_id, stored.metadata, stored.content)
    end

    # Writes the book's metadata and content to its place in the library,
    # creating the book's directory first.
    def save(book)
      library.mkdir(book.id)
      record = Preamble.new(book.metadata, book.content)
      record.save(library.path_to(book.id))
    end

    # Downloads the single archive.org item whose identifier is +book_id+.
    def archive_org_get_book(book_id, &block)
      conditions = { :filters => ["identifier:#{book_id}"] }
      archive_org_get(conditions, &block)
    end

    # Downloads every archive.org item matched by the given range of years.
    def archive_org_get_range_of_years(start_year, end_year, &block)
      conditions = { :start_year => start_year, :end_year => end_year }
      archive_org_get(conditions, &block)
    end

    # Downloads a set of books to the on-disk library and returns a list of
    # the ids of the books that were saved. Books whose download failed are
    # reported on stderr and skipped. When a block is given it receives each
    # saved book together with this librarian.
    def archive_org_get(conditions, &block)
      saved_ids = []

      Archdown.new.download_all(conditions) do |metadata, content, failure|
        if failure
          # TODO: logging
          $stderr.puts "Unable to download from archive.org: #{failure}"
          next
        end

        book = Book.create(metadata["archive_org_id"], metadata, content)
        save(book)
        yield book, self if block_given?
        saved_ids << book.id
      end

      saved_ids
    end
  end
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'fileutils'
|
2
|
+
|
3
|
+
require 'wordtree/archdown'
|
4
|
+
require 'wordtree/library_locator'
|
5
|
+
|
6
|
+
module WordTree
  # An on-disk collection of books rooted at a single directory. Translates
  # book IDs into directories and file paths below that root.
  class Library

    # File-name patterns per file type; "%s" is replaced by the book ID.
    FILE_TYPES = {
      :raw => "%s.md"
    }

    # The file path to the root of the library directory, e.g. /data/library
    attr_reader :root

    def initialize(root)
      @root = root
    end

    # Full path of the subdirectory that holds a book, resolved against the
    # library root. Accepts either a String or a LibraryLocator object.
    def dir_of(book_id)
      locator = LibraryLocator.identity(book_id)
      File.expand_path(locator.relpath, root)
    end

    # Full path of a book's file of the given type (default :raw).
    def path_to(book_id, type=:raw)
      directory = dir_of(book_id)
      File.join(directory, file_type(book_id, type))
    end

    # File name (without directory) of a book's file of the given type.
    def file_type(book_id, type=:raw)
      pattern = FILE_TYPES[type]
      pattern % LibraryLocator.identity(book_id).id
    end

    # Create all subdirs up to the location where a book is stored.
    # Accepts either a String or a LibraryLocator object.
    def mkdir(book_id)
      FileUtils.mkdir_p(dir_of(book_id))
    end
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
module WordTree
  # A class that converts from a book ID to a location within the library, e.g.
  #
  #   "firstbooknapole00gruagoog"
  #
  # becomes
  #
  #   "fi/og/firstbooknapole00gruagoog"
  #
  # or, in context of the full path:
  #
  #   [/data/library/] "fi/og/firstbooknapole00gruagoog" [/firstbooknapole00gruagoog.md]
  #
  class LibraryLocator
    # The book ID to locate
    attr_reader :id

    # Construct a LibraryLocator from a string (book ID)
    def initialize(id)
      @id = id
    end

    # First-level directory name: the first two characters of the ID,
    # lower-cased (the whole ID when it is shorter than two characters).
    def first
      @id[0..1].downcase
    end

    # Second-level directory name: the last two characters of the ID,
    # lower-cased. An ID shorter than two characters is used whole, because
    # slicing [-2..-1] on it yields nil rather than a String.
    def last
      (@id[-2..-1] || @id).downcase
    end

    # Returns a "relative" path to be joined to the library root,
    # e.g. if the identifier is "firstbooknapole00gruagoog", then relpath
    # returns "fi/og/firstbooknapole00gruagoog", i.e. probably later to
    # become something like "/data/library/fi/og/firstbooknapole00gruagoog"
    def relpath
      File.join(first, last, @id)
    end

    # Constructor that is as willing to use a String as it is a LibraryLocator:
    # an existing locator is returned as-is, anything else is wrapped.
    def self.identity(id)
      id.is_a?(LibraryLocator) ? id : new(id)
    end
  end
end
|
@@ -0,0 +1,73 @@
|
|
1
|
+
module WordTree
  # Stateless helpers for normalizing book text and wrapping it into lines.
  module TextUtils
    # Splits +text+ in two near +split_index+, preferring a word boundary.
    #
    # Returns a two-element Array [head, tail]:
    # * [text, ""] when +split_index+ is at or beyond the end of the text;
    # * otherwise, split at the last space at or before +split_index+, with
    #   that space dropped;
    # * otherwise (no such space), a hard split exactly at +split_index+.
    def self.split_near(text, split_index)
      return [text, ""] if split_index >= text.size

      # rindex would count a negative index from the end of the string; a
      # negative split_index must instead go straight to the hard split.
      space_at = split_index >= 0 ? text.rindex(' ', split_index) : nil

      if space_at
        [text[0...space_at], text[(space_at + 1)..-1]]
      else
        [text[0...split_index], text[split_index..-1]]
      end
    end

    # Remove punctuation and non-alphabetical characters from a text, and
    # return a cleaned-up, lower-cased version wrapped at +wrap+ characters
    # per line.
    #
    # * Newlines are treated as spaces, and runs of spaces collapse into one.
    # * A dash is dropped together with any spaces/newlines that follow it, so
    #   a word hyphenated across a line break is joined back together.
    # * Only the letters a-z and single spaces survive.
    #
    # Every output line, including the last, ends with "\n"; empty input
    # yields "\n".
    #
    # Raises ArgumentError when +wrap+ is smaller than 1, since no progress
    # could be made while wrapping.
    def self.clean_text(input, wrap=120)
      raise ArgumentError, "wrap must be at least 1 (got #{wrap.inspect})" unless wrap >= 1

      # Ignore non-UTF-8 characters; downcasing here also takes care of A-Z.
      input = input.encode('UTF-8', :invalid => :replace, :undef => :replace).downcase

      lower_a = 'a'.ord
      lower_z = 'z'.ord
      dash = '-'.ord
      space = ' '.ord
      newline = "\n".ord

      output = String.new
      join_lines = false
      just_added_space = false

      input.each_char do |char|
        code = char.ord
        # Change newlines to spaces
        code = space if code == newline

        if code == dash
          # In case of a dash, set the scoop-spaces-up flag
          join_lines = true
        elsif join_lines && code == space
          # Swallow whitespace that follows a dash
        elsif (code >= lower_a && code <= lower_z) || (code == space && !just_added_space)
          # Add letters and (non-repeated) spaces
          output << code.chr
          just_added_space = (code == space)
          join_lines = false
        end
      end

      wrapped_output = String.new
      remainder = output
      # Always emit at least one line, then keep splitting while what is left
      # is still longer than a line.
      loop do
        line, remainder = split_near(remainder, wrap)
        wrapped_output << line + "\n"
        break unless remainder.size > wrap
      end
      wrapped_output << remainder + "\n" unless remainder.empty?

      wrapped_output
    end
  end
end
|