wordtree 0.3.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 7bcb5a59a130a24ca1bede26a9f3b8efa651230c
4
+ data.tar.gz: 03ef94ffd836a11f891065fd7eb3c73d4aa8bdd2
5
+ SHA512:
6
+ metadata.gz: cbf2b847f90fdffd1a52a4f7ea2c2fd78ce339ded803cbf30720fba1dced77b2ea2e9b1b84f5fee147d472d0e673437ad0e58ccffcb1a5fd2794fa87ec739384
7
+ data.tar.gz: 0abb233dd846913d69e13087d84c174136e8a1511c417f5239e3395972f17af71f43091079e8fc605354843c09bb73b6e82ca358801303d77f692c3d8d140ea5
data/ext/Makefile ADDED
@@ -0,0 +1,239 @@
1
+
2
+ SHELL = /bin/sh
3
+
4
+ # V=0 quiet, V=1 verbose. other values don't work.
5
+ V = 0
6
+ Q1 = $(V:1=)
7
+ Q = $(Q1:0=@)
8
+ ECHO1 = $(V:1=@:)
9
+ ECHO = $(ECHO1:0=@echo)
10
+
11
+ #### Start of system configuration section. ####
12
+
13
+ srcdir = .
14
+ topdir = /Users/duane/.rbenv/versions/2.1.3/include/ruby-2.1.0
15
+ hdrdir = $(topdir)
16
+ arch_hdrdir = /Users/duane/.rbenv/versions/2.1.3/include/ruby-2.1.0/x86_64-darwin13.0
17
+ PATH_SEPARATOR = :
18
+ VPATH = $(srcdir):$(arch_hdrdir)/ruby:$(hdrdir)/ruby
19
+ prefix = $(DESTDIR)/Users/duane/.rbenv/versions/2.1.3
20
+ rubysitearchprefix = $(rubylibprefix)/$(sitearch)
21
+ rubyarchprefix = $(rubylibprefix)/$(arch)
22
+ rubylibprefix = $(libdir)/$(RUBY_BASE_NAME)
23
+ exec_prefix = $(prefix)
24
+ vendorarchhdrdir = $(vendorhdrdir)/$(sitearch)
25
+ sitearchhdrdir = $(sitehdrdir)/$(sitearch)
26
+ rubyarchhdrdir = $(rubyhdrdir)/$(arch)
27
+ vendorhdrdir = $(rubyhdrdir)/vendor_ruby
28
+ sitehdrdir = $(rubyhdrdir)/site_ruby
29
+ rubyhdrdir = $(includedir)/$(RUBY_VERSION_NAME)
30
+ vendorarchdir = $(vendorlibdir)/$(sitearch)
31
+ vendorlibdir = $(vendordir)/$(ruby_version)
32
+ vendordir = $(rubylibprefix)/vendor_ruby
33
+ sitearchdir = $(sitelibdir)/$(sitearch)
34
+ sitelibdir = $(sitedir)/$(ruby_version)
35
+ sitedir = $(rubylibprefix)/site_ruby
36
+ rubyarchdir = $(rubylibdir)/$(arch)
37
+ rubylibdir = $(rubylibprefix)/$(ruby_version)
38
+ sitearchincludedir = $(includedir)/$(sitearch)
39
+ archincludedir = $(includedir)/$(arch)
40
+ sitearchlibdir = $(libdir)/$(sitearch)
41
+ archlibdir = $(libdir)/$(arch)
42
+ ridir = $(datarootdir)/$(RI_BASE_NAME)
43
+ mandir = $(datarootdir)/man
44
+ localedir = $(datarootdir)/locale
45
+ libdir = $(exec_prefix)/lib
46
+ psdir = $(docdir)
47
+ pdfdir = $(docdir)
48
+ dvidir = $(docdir)
49
+ htmldir = $(docdir)
50
+ infodir = $(datarootdir)/info
51
+ docdir = $(datarootdir)/doc/$(PACKAGE)
52
+ oldincludedir = $(DESTDIR)/usr/include
53
+ includedir = $(prefix)/include
54
+ localstatedir = $(prefix)/var
55
+ sharedstatedir = $(prefix)/com
56
+ sysconfdir = $(prefix)/etc
57
+ datadir = $(datarootdir)
58
+ datarootdir = $(prefix)/share
59
+ libexecdir = $(exec_prefix)/libexec
60
+ sbindir = $(exec_prefix)/sbin
61
+ bindir = $(exec_prefix)/bin
62
+ archdir = $(rubyarchdir)
63
+
64
+
65
+ CC = clang
66
+ CXX = g++
67
+ LIBRUBY = $(LIBRUBY_A)
68
+ LIBRUBY_A = lib$(RUBY_SO_NAME)-static.a
69
+ LIBRUBYARG_SHARED =
70
+ LIBRUBYARG_STATIC = -l$(RUBY_SO_NAME)-static -framework CoreFoundation
71
+ empty =
72
+ OUTFLAG = -o $(empty)
73
+ COUTFLAG = -o $(empty)
74
+
75
+ RUBY_EXTCONF_H =
76
+ cflags = $(optflags) $(debugflags) $(warnflags)
77
+ optflags = -O3 -fno-fast-math
78
+ debugflags = -ggdb3
79
+ warnflags = -Wall -Wextra -Wno-unused-parameter -Wno-parentheses -Wno-long-long -Wno-missing-field-initializers -Wunused-variable -Wpointer-arith -Wwrite-strings -Wdeclaration-after-statement -Wshorten-64-to-32 -Wimplicit-function-declaration -Wdivision-by-zero -Wextra-tokens
80
+ CCDLFLAGS = -fno-common
81
+ CFLAGS = $(CCDLFLAGS) -O3 -Wno-error=shorten-64-to-32 -pipe $(ARCH_FLAG)
82
+ INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir)
83
+ DEFS =
84
+ CPPFLAGS = -I/Users/duane/.rbenv/versions/2.1.3/include -D_XOPEN_SOURCE -D_DARWIN_C_SOURCE -D_DARWIN_UNLIMITED_SELECT -D_REENTRANT $(DEFS) $(cppflags)
85
+ CXXFLAGS = $(CCDLFLAGS) $(cxxflags) $(ARCH_FLAG)
86
+ ldflags = -L. -L/Users/duane/.rbenv/versions/2.1.3/lib -fstack-protector -L/usr/local/lib
87
+ dldflags = -Wl,-undefined,dynamic_lookup -Wl,-multiply_defined,suppress
88
+ ARCH_FLAG =
89
+ DLDFLAGS = $(ldflags) $(dldflags) $(ARCH_FLAG)
90
+ LDSHARED = $(CC) -dynamic -bundle
91
+ LDSHAREDXX = $(CXX) -dynamic -bundle
92
+ AR = ar
93
+ EXEEXT =
94
+
95
+ RUBY_INSTALL_NAME = ruby
96
+ RUBY_SO_NAME = ruby
97
+ RUBYW_INSTALL_NAME =
98
+ RUBY_VERSION_NAME = $(RUBY_BASE_NAME)-$(ruby_version)
99
+ RUBYW_BASE_NAME = rubyw
100
+ RUBY_BASE_NAME = ruby
101
+
102
+ arch = x86_64-darwin13.0
103
+ sitearch = $(arch)
104
+ ruby_version = 2.1.0
105
+ ruby = $(bindir)/ruby
106
+ RUBY = $(ruby)
107
+ ruby_headers = $(hdrdir)/ruby.h $(hdrdir)/ruby/ruby.h $(hdrdir)/ruby/defines.h $(hdrdir)/ruby/missing.h $(hdrdir)/ruby/intern.h $(hdrdir)/ruby/st.h $(hdrdir)/ruby/subst.h $(arch_hdrdir)/ruby/config.h
108
+
109
+ RM = rm -f
110
+ RM_RF = $(RUBY) -run -e rm -- -rf
111
+ RMDIRS = rmdir -p
112
+ MAKEDIRS = mkdir -p
113
+ INSTALL = /usr/bin/install -c
114
+ INSTALL_PROG = $(INSTALL) -m 0755
115
+ INSTALL_DATA = $(INSTALL) -m 644
116
+ COPY = cp
117
+ TOUCH = exit >
118
+
119
+ #### End of system configuration section. ####
120
+
121
+ preload =
122
+
123
+ libpath = . $(libdir)
124
+ LIBPATH = -L. -L$(libdir)
125
+ DEFFILE =
126
+
127
+ CLEANFILES = mkmf.log
128
+ DISTCLEANFILES =
129
+ DISTCLEANDIRS =
130
+
131
+ extout =
132
+ extout_prefix =
133
+ target_prefix =
134
+ LOCAL_LIBS =
135
+ LIBS = -lpthread -lgmp -ldl -lobjc
136
+ ORIG_SRCS = wordtree.cc
137
+ SRCS = $(ORIG_SRCS)
138
+ OBJS = wordtree.o
139
+ HDRS =
140
+ TARGET = wordtree
141
+ TARGET_NAME = wordtree
142
+ TARGET_ENTRY = Init_$(TARGET_NAME)
143
+ DLLIB = $(TARGET).bundle
144
+ EXTSTATIC =
145
+ STATIC_LIB =
146
+
147
+ TIMESTAMP_DIR = .
148
+ BINDIR = $(bindir)
149
+ RUBYCOMMONDIR = $(sitedir)$(target_prefix)
150
+ RUBYLIBDIR = $(sitelibdir)$(target_prefix)
151
+ RUBYARCHDIR = $(sitearchdir)$(target_prefix)
152
+ HDRDIR = $(rubyhdrdir)/ruby$(target_prefix)
153
+ ARCHHDRDIR = $(rubyhdrdir)/$(arch)/ruby$(target_prefix)
154
+
155
+ TARGET_SO = $(DLLIB)
156
+ CLEANLIBS = $(TARGET).bundle
157
+ CLEANOBJS = *.o *.bak
158
+
159
+ all: $(DLLIB)
160
+ static: $(STATIC_LIB)
161
+ .PHONY: all install static install-so install-rb
162
+ .PHONY: clean clean-so clean-static clean-rb
163
+
164
+ clean-static::
165
+ clean-rb-default::
166
+ clean-rb::
167
+ clean-so::
168
+ clean: clean-so clean-static clean-rb-default clean-rb
169
+ -$(Q)$(RM) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES) .*.time
170
+
171
+ distclean-rb-default::
172
+ distclean-rb::
173
+ distclean-so::
174
+ distclean-static::
175
+ distclean: clean distclean-so distclean-static distclean-rb-default distclean-rb
176
+ -$(Q)$(RM) Makefile $(RUBY_EXTCONF_H) conftest.* mkmf.log
177
+ -$(Q)$(RM) core ruby$(EXEEXT) *~ $(DISTCLEANFILES)
178
+ -$(Q)$(RMDIRS) $(DISTCLEANDIRS) 2> /dev/null || true
179
+
180
+ realclean: distclean
181
+ install: install-so install-rb
182
+
183
+ install-so: $(DLLIB) $(TIMESTAMP_DIR)/.RUBYARCHDIR.time
184
+ $(INSTALL_PROG) $(DLLIB) $(RUBYARCHDIR)
185
+ clean-static::
186
+ -$(Q)$(RM) $(STATIC_LIB)
187
+ install-rb: pre-install-rb install-rb-default
188
+ install-rb-default: pre-install-rb-default
189
+ pre-install-rb: Makefile
190
+ pre-install-rb-default: Makefile
191
+ pre-install-rb-default:
192
+ $(ECHO) installing default wordtree libraries
193
+ $(TIMESTAMP_DIR)/.RUBYARCHDIR.time:
194
+ $(Q) $(MAKEDIRS) $(@D) $(RUBYARCHDIR)
195
+ $(Q) $(TOUCH) $@
196
+
197
+ site-install: site-install-so site-install-rb
198
+ site-install-so: install-so
199
+ site-install-rb: install-rb
200
+
201
+ .SUFFIXES: .c .m .cc .mm .cxx .cpp .C .o
202
+
203
+ .cc.o:
204
+ $(ECHO) compiling $(<)
205
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
206
+
207
+ .mm.o:
208
+ $(ECHO) compiling $(<)
209
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
210
+
211
+ .cxx.o:
212
+ $(ECHO) compiling $(<)
213
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
214
+
215
+ .cpp.o:
216
+ $(ECHO) compiling $(<)
217
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
218
+
219
+ .C.o:
220
+ $(ECHO) compiling $(<)
221
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
222
+
223
+ .c.o:
224
+ $(ECHO) compiling $(<)
225
+ $(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $<
226
+
227
+ .m.o:
228
+ $(ECHO) compiling $(<)
229
+ $(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $<
230
+
231
+ $(DLLIB): $(OBJS) Makefile
232
+ $(ECHO) linking shared-object $(DLLIB)
233
+ -$(Q)$(RM) $(@)
234
+ $(Q) $(LDSHAREDXX) -o $@ $(OBJS) $(LIBPATH) $(DLDFLAGS) $(LOCAL_LIBS) $(LIBS)
235
+ $(Q) $(POSTLINK)
236
+
237
+
238
+
239
+ $(OBJS): $(HDRS) $(ruby_headers)
data/ext/extconf.rb ADDED
@@ -0,0 +1,5 @@
1
require "mkmf"

# Build script for the native extension. The same name is used for the
# --with-wordtree-dir style configure options and for the generated target.
extension_name = "wordtree"

# Register the --with-wordtree-{dir,include,lib} command-line options.
dir_config(extension_name)

# Write the Makefile that compiles the sources in this directory.
create_makefile(extension_name)
data/ext/wordtree.cc ADDED
@@ -0,0 +1,125 @@
1
+ #include <ruby.h>
2
+ #include <ruby/encoding.h>
3
+ #include <stdio.h>
4
+ #include <string.h>
5
+ #include <assert.h>
6
+
7
+ // for rubinius
8
+ #ifndef rb_enc_fast_mbclen
9
+ # define rb_enc_fast_mbclen rb_enc_mbclen
10
+ #endif
11
+
12
+ static rb_encoding* u8_enc;
13
+ static rb_encoding* bin_enc;
14
+
15
/** Cleans a NUL-terminated string in place. Transforms text such as:
 *
 *     And behold, I said, "This is no good!"
 *     What shall ye say unto these people, there-
 *     fore?
 *
 * into a cleaned up single line of text, like the following:
 *
 *     and behold i said this is no good.what shall ye say unto these people therefore.
 *
 * Spaces indicate word boundaries, while periods indicate sentence boundaries.
 *
 * Rules, applied byte by byte:
 *   - 'A'..'Z' are lowercased, '\n' counts as a space, '?' and '!' count as '.'.
 *   - '-' is dropped and swallows the spaces that follow it, so words broken
 *     across lines are joined back together.
 *   - Runs of spaces or periods collapse to one; a space next to a period is
 *     removed.
 *   - Every other byte (digits, other punctuation, non-ASCII) is dropped.
 *
 * The result never starts with a space or a period and never ends with a space.
 *
 * @param text  writable, NUL-terminated buffer; NULL is treated as empty.
 * @return      length in bytes of the cleaned text. The buffer is
 *              NUL-terminated at that length.
 */
size_t text_clean_cstr(char* text)
{
  if (text == NULL || *text == '\0') return 0;

  char* write = text;
  bool join_lines = false;
  bool just_added_space = true;   // prevent prefix spaces
  bool just_added_period = false;

  for (const char* read = text; *read; read++) {
    char c = *read;
    if (c >= 'A' && c <= 'Z') {
      // Change upper case to lowercase
      c = (char)(c - 'A' + 'a');
    } else if (c == '\n') {
      // Change newlines to spaces (i.e. both count as whitespace)
      c = ' ';
    } else if (c == '?' || c == '!') {
      // Change exclamation, question marks to periods (i.e. sentence boundaries)
      c = '.';
    }

    if (c == '-') {
      join_lines = true;
    } else if (join_lines && c == ' ') {
      // ignore whitespace after a dash (i.e. including newlines, which is the
      // most common case because words that are broken by syllables are dashed)
    } else if (c == '.' && !just_added_period) {
      // Erase the space before the period, but only if one was really written:
      // just_added_space starts out true with nothing in the buffer, and
      // stepping back then would write before the start of the buffer.
      if (just_added_space && write > text) write--;
      // A period before any word would open the text with an empty sentence,
      // so it is dropped rather than emitted.
      if (write > text) *write++ = '.';
      just_added_period = true;
      just_added_space = false;
      join_lines = false;
    } else if (c == ' ' && !just_added_space && !just_added_period) {
      *write++ = ' ';
      just_added_space = true;
      just_added_period = false;
    } else if (c >= 'a' && c <= 'z') {
      *write++ = c;
      just_added_space = false;
      just_added_period = false;
      join_lines = false;
    }
  }

  // Erase space at end of text (guarded: the buffer may still be empty).
  if (just_added_space && write > text) write--;

  // write never passes read, so this stays inside the original string.
  *write = '\0';

  // Return the new length of the string
  return (size_t)(write - text);
}
78
+
79
+ static VALUE text_common_trigrams(VALUE self, VALUE text) {
80
+ char* ptext = RSTRING_PTR(text);
81
+ long len = RSTRING_LEN(text);
82
+
83
+ if (len < 3) return INT2NUM(0);
84
+
85
+ /* 28 most common English trigrams, all squished together */
86
+ char common_trigrams[] = "allandedtentereforhashatherhisingionithmenncendeoftsthterthathethitiotisverwaswityou";
87
+
88
+ char* ptr = ptext;
89
+ char* tail = ptext + len;
90
+ int i = 0, common_matched = 0;
91
+ while (ptr < tail) {
92
+ for (i = 0; i < sizeof(common_trigrams); i += 3) {
93
+ if (memcmp(ptr, common_trigrams + i, 3) == 0) {
94
+ common_matched++;
95
+ break;
96
+ }
97
+ }
98
+ ptr++;
99
+ }
100
+
101
+ return INT2NUM(common_matched);
102
+ }
103
+
104
+ static VALUE text_clean(VALUE self, VALUE text) {
105
+ rb_str_modify(text);
106
+
107
+ char* ctext = StringValueCStr(text);
108
+ size_t new_length = text_clean_cstr(ctext);
109
+
110
+ rb_str_set_len(text, (long)new_length);
111
+
112
+ return text;
113
+ }
114
+
115
+ extern "C"
116
+ void Init_wordtree() {
117
+ VALUE rb_mWordTree = rb_define_module("WordTree");
118
+ VALUE rb_mText = rb_define_module_under(rb_mWordTree, "Text");
119
+
120
+ u8_enc = rb_utf8_encoding();
121
+ bin_enc = rb_ascii8bit_encoding();
122
+
123
+ rb_define_module_function(rb_mText, "clean", RUBY_METHOD_FUNC(text_clean), 1);
124
+ rb_define_module_function(rb_mText, "common_trigrams", RUBY_METHOD_FUNC(text_common_trigrams), 1);
125
+ }
@@ -23,10 +23,8 @@ module WordTree
23
23
 
24
24
  def content_for(archivist_book)
25
25
  [archivist_book.download, nil]
26
- rescue Archivist::Model::Document::UnsupportedFormat => e
27
- [nil, e.to_s]
28
- rescue StandardError => e
29
- [nil, e.to_s]
26
+ rescue StandardError, Archivist::Model::Document::UnsupportedFormat => e
27
+ [nil, e]
30
28
  end
31
29
 
32
30
  def download_all(search_terms, &each_book)
data/lib/wordtree/book.rb CHANGED
@@ -1,7 +1,8 @@
1
1
  require 'virtus'
2
2
  require 'simhash'
3
+ require 'set'
3
4
 
4
- require 'wordtree/text_utils'
5
+ require 'wordtree/text'
5
6
 
6
7
  module WordTree
7
8
  class Book
@@ -18,12 +19,12 @@ module WordTree
18
19
  attribute :size_bytes, Integer, :default => :content_size
19
20
  # A simhash (locality-sensitive hash) of the content
20
21
  attribute :simhash, Integer
22
+ attribute :ngrams_counted, Set
21
23
 
22
24
  attribute :content, String
23
25
 
24
26
  def initialize(*args)
25
27
  super
26
- @ngrams = {}
27
28
  end
28
29
 
29
30
  def self.create(id, metadata, content)
@@ -38,44 +39,17 @@ module WordTree
38
39
  attributes.select{ |k,v| !v.nil? && k != :content }
39
40
  end
40
41
 
41
- def content_clean(wrap=120)
42
- if @content_clean_wrap != wrap
43
- # Memoize content_clean (using last wrap size)
44
- @content_clean_wrap = wrap
45
- @content_clean = TextUtils.clean_text(content, wrap)
46
- end
47
- @content_clean
42
+ # Modify and clean content in-place (slightly faster)
43
+ def content_clean!
44
+ WordTree::Text.clean(content)
48
45
  end
49
46
 
50
- def content_size
51
- content ? content.size : nil
52
- end
53
-
54
- def each_ngram(n=1, &block)
55
- TextUtils.each_ngram(content_clean, n, &block)
56
- end
57
-
58
- def set_ngrams(n, lookup)
59
- raise ArgumentError, "must be a Hash" unless lookup.is_a?(Hash)
60
- @ngrams[n] = lookup
47
+ def content_clean
48
+ @content_clean ||= WordTree::Text.clean(content.dup)
61
49
  end
62
50
 
63
- def ngrams(n=1)
64
- # Memoize ngram counts
65
- @ngrams[n] ||= count_ngrams(n)
66
- end
67
-
68
- def all_ngrams
69
- @ngrams
70
- end
71
-
72
- def count_ngrams(n=1)
73
- {}.tap do |tally|
74
- each_ngram(n) do |ngram|
75
- tally[ngram] ||= 0
76
- tally[ngram] += 1
77
- end
78
- end
51
+ def content_size
52
+ content ? content.size : nil
79
53
  end
80
54
 
81
55
  def calculate_simhash
@@ -0,0 +1,38 @@
1
module WordTree
  # An Enumerable list of books (or of paths/ids naming books), built from
  # one of several kinds of source.
  class BookList
    include Enumerable

    # Can be initialized from the following sources:
    # - a WordTree::Disk::Library object
    # - an open File object (containing a list of files or paths to books)
    # - a String directory (presumed to be the library on disk)
    # - a String file (containing a list of files or paths to books)
    #
    # Raises Errno::ENOENT when a String source names nothing on disk, and
    # ArgumentError for any other kind of source.
    def initialize(source)
      @source = source
      @iterable = iterable_from_source(source)
    end

    # Resolves +source+ into the object that #each will iterate over.
    #
    # A Library is used as-is. A File is read to the end, split into lines and
    # then closed. A String is treated as a library directory if it is one,
    # otherwise as the path of a file holding one entry per line.
    def iterable_from_source(source)
      case source
      when WordTree::Disk::Library
        source
      when File
        begin
          source.read.split("\n")
        ensure
          # Close the handle we were given, not the Array of lines.
          source.close
        end
      when String
        if File.directory?(source)
          WordTree::Disk::Library.new(source)
        elsif File.exist?(source)
          IO.read(source).split("\n")
        else
          raise Errno::ENOENT, "Unable to find source for BookList, #{source.inspect}"
        end
      else
        # Fail here rather than later with a NoMethodError on nil in #each.
        raise ArgumentError, "Unsupported source for BookList, #{source.inspect}"
      end
    end

    # Yields each entry of the underlying source; without a block, returns
    # whatever the source's own #each returns.
    def each(&block)
      @iterable.each(&block)
    end
  end
end
@@ -10,6 +10,8 @@ module WordTree
10
10
 
11
11
  attr_reader :library
12
12
 
13
+ MissingContent = Class.new(StandardError)
14
+
13
15
  # @library can be either a string (the path of the library) or a
14
16
  # WordTree::Disk::Library object
15
17
  def initialize(library)
@@ -20,7 +22,7 @@ module WordTree
20
22
  end
21
23
  end
22
24
 
23
- def find_without_ngrams(book_id)
25
+ def find(book_id)
24
26
  begin
25
27
  retrieved = Preamble.load(library.path_to(book_id), :external_encoding => "utf-8")
26
28
  Book.create(book_id, retrieved.metadata, retrieved.content)
@@ -29,45 +31,17 @@ module WordTree
29
31
  end
30
32
  end
31
33
 
32
- def find(book_id)
33
- find_without_ngrams(book_id).tap do |book|
34
- (1..9).each do |n|
35
- path = library.path_to(book_id, :ngrams, :n => n)
36
- if File.exist?(path)
37
- File.open(path) do |f|
38
- hash = JSON.load(f)
39
- book.set_ngrams(n, hash)
40
- end
41
- end
42
- end
43
- end
44
- end
45
-
46
34
  def each(file_suffix_re=/\.(md|txt)$/, &block)
47
- library.each(file_suffix_re) do |path|
35
+ library.each_with_id(file_suffix_re) do |path, id|
48
36
  retrieved = Preamble.load(path, :external_encoding => "utf-8")
49
37
  yield Book.new(retrieved.metadata.merge("content" => retrieved.content))
50
38
  end
51
39
  end
52
40
 
53
- def save_without_ngrams(book)
54
- library.mkdir(book.id)
55
- Preamble.new(book.metadata, book.content || "").save(library.path_to(book.id))
56
- end
57
-
58
- def save_ngrams(book)
59
- book.all_ngrams.each_pair do |n, hash|
60
- path = library.path_to(book.id, :ngrams, :n => n)
61
- File.open(path, "w") do |file|
62
- file.write hash.to_json
63
- end
64
- end
65
- end
66
-
67
41
  def save(book)
68
- save_without_ngrams(book).tap do
69
- save_ngrams(book)
70
- end
42
+ library.mkdir(book.id)
43
+ raise MissingContent, "book #{book.id} is missing content" unless book.content
44
+ Preamble.new(book.metadata, book.content).save(library.path_to(book.id))
71
45
  end
72
46
 
73
47
  def archive_org_get(*book_ids, &block)
@@ -92,6 +66,7 @@ module WordTree
92
66
  if failure
93
67
  #TODO: logging
94
68
  $stderr.puts "Unable to download from archive.org: #{failure}"
69
+ raise failure
95
70
  else
96
71
  book = Book.create(metadata["archive_org_id"], metadata, content)
97
72
  save(book)
@@ -45,7 +45,7 @@ module WordTree
45
45
  end
46
46
 
47
47
  # Breadth-first search of the directory structure, operating on each book
48
- def each(file_suffix_re=/\.(md|txt)$/, &block)
48
+ def each_with_id(file_suffix_re=/\.(md|txt)$/, &block)
49
49
  Find.find(@root) do |path|
50
50
  if FileTest.directory?(path)
51
51
  if File.basename(path)[0] == ?.
@@ -60,6 +60,10 @@ module WordTree
60
60
  end
61
61
  end
62
62
 
63
# Yields the path of every book file found by each_with_id, discarding the id.
# The optional +file_suffix_re+ is forwarded unchanged, so callers that pass
# a suffix filter to #each keep working.
def each(file_suffix_re=/\.(md|txt)$/, &block)
  each_with_id(file_suffix_re) { |path, _id| yield path }
end
66
+
63
67
  end
64
68
  end
65
69
  end
@@ -0,0 +1,12 @@
1
module WordTree
  # Tallies n-gram occurrences in a Trie keyed by the n-gram.
  class Ngrams
    def initialize
      @trie = Trie.new
    end

    # Adds one to the stored count for +ngram+; a missing (nil/false) entry
    # counts as zero. Returns whatever the trie's #set returns.
    def inc(ngram)
      current = @trie.get(ngram)
      current = 0 unless current
      @trie.set(ngram, current + 1)
    end
  end
end
@@ -0,0 +1,37 @@
1
+ require 'strscan'
2
+ require_relative "../../ext/wordtree"
3
+
4
module WordTree
  module Text
    # Splits +text+ in two at the last space at or before +split_index+.
    #
    # Returns [head, tail]. The space that was split on is dropped. When there
    # is no space at or before +split_index+, the text is cut exactly at
    # +split_index+. When +split_index+ is at or past the end of the text,
    # the whole text is the head and the tail is "".
    def self.split_near(text, split_index)
      return [text, ""] if split_index >= text.size

      # rindex treats a negative position as an offset from the end, so only
      # search when the index is a real (non-negative) position.
      space_at = split_index >= 0 ? text.rindex(" ", split_index) : nil

      if space_at
        [text[0...space_at], text[(space_at + 1)..-1]]
      else
        [text[0...split_index], text[split_index..-1]]
      end
    end

    # Wraps +input+ at about +wrap+ characters per line, breaking at spaces
    # where possible (see split_near), and returns the wrapped text with every
    # line terminated by "\n". The input itself is not modified.
    #
    # Raises ArgumentError unless +wrap+ is positive, since no progress could
    # be made otherwise.
    def self.word_wrap(input, wrap=120)
      raise ArgumentError, "wrap must be positive" unless wrap > 0

      wrapped_output = String.new
      remainder = input
      # At least one line is always emitted, even for an empty input.
      loop do
        # Split what is left over, not the original input, so each pass
        # makes progress.
        output_line, remainder = split_near(remainder, wrap)
        wrapped_output << output_line << "\n"
        break unless remainder.size > wrap
      end
      wrapped_output << remainder << "\n" unless remainder.empty?

      wrapped_output
    end

  end
end
@@ -1,3 +1,3 @@
1
- module Wordtree
2
- VERSION = "0.3.1"
1
+ module WordTree
2
+ VERSION = "0.4.0"
3
3
  end