wordtree 0.3.1 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 7bcb5a59a130a24ca1bede26a9f3b8efa651230c
4
+ data.tar.gz: 03ef94ffd836a11f891065fd7eb3c73d4aa8bdd2
5
+ SHA512:
6
+ metadata.gz: cbf2b847f90fdffd1a52a4f7ea2c2fd78ce339ded803cbf30720fba1dced77b2ea2e9b1b84f5fee147d472d0e673437ad0e58ccffcb1a5fd2794fa87ec739384
7
+ data.tar.gz: 0abb233dd846913d69e13087d84c174136e8a1511c417f5239e3395972f17af71f43091079e8fc605354843c09bb73b6e82ca358801303d77f692c3d8d140ea5
data/ext/Makefile ADDED
@@ -0,0 +1,239 @@
1
+
2
+ SHELL = /bin/sh
3
+
4
+ # V=0 quiet, V=1 verbose. other values don't work.
5
+ V = 0
6
+ Q1 = $(V:1=)
7
+ Q = $(Q1:0=@)
8
+ ECHO1 = $(V:1=@:)
9
+ ECHO = $(ECHO1:0=@echo)
10
+
11
+ #### Start of system configuration section. ####
12
+
13
+ srcdir = .
14
+ topdir = /Users/duane/.rbenv/versions/2.1.3/include/ruby-2.1.0
15
+ hdrdir = $(topdir)
16
+ arch_hdrdir = /Users/duane/.rbenv/versions/2.1.3/include/ruby-2.1.0/x86_64-darwin13.0
17
+ PATH_SEPARATOR = :
18
+ VPATH = $(srcdir):$(arch_hdrdir)/ruby:$(hdrdir)/ruby
19
+ prefix = $(DESTDIR)/Users/duane/.rbenv/versions/2.1.3
20
+ rubysitearchprefix = $(rubylibprefix)/$(sitearch)
21
+ rubyarchprefix = $(rubylibprefix)/$(arch)
22
+ rubylibprefix = $(libdir)/$(RUBY_BASE_NAME)
23
+ exec_prefix = $(prefix)
24
+ vendorarchhdrdir = $(vendorhdrdir)/$(sitearch)
25
+ sitearchhdrdir = $(sitehdrdir)/$(sitearch)
26
+ rubyarchhdrdir = $(rubyhdrdir)/$(arch)
27
+ vendorhdrdir = $(rubyhdrdir)/vendor_ruby
28
+ sitehdrdir = $(rubyhdrdir)/site_ruby
29
+ rubyhdrdir = $(includedir)/$(RUBY_VERSION_NAME)
30
+ vendorarchdir = $(vendorlibdir)/$(sitearch)
31
+ vendorlibdir = $(vendordir)/$(ruby_version)
32
+ vendordir = $(rubylibprefix)/vendor_ruby
33
+ sitearchdir = $(sitelibdir)/$(sitearch)
34
+ sitelibdir = $(sitedir)/$(ruby_version)
35
+ sitedir = $(rubylibprefix)/site_ruby
36
+ rubyarchdir = $(rubylibdir)/$(arch)
37
+ rubylibdir = $(rubylibprefix)/$(ruby_version)
38
+ sitearchincludedir = $(includedir)/$(sitearch)
39
+ archincludedir = $(includedir)/$(arch)
40
+ sitearchlibdir = $(libdir)/$(sitearch)
41
+ archlibdir = $(libdir)/$(arch)
42
+ ridir = $(datarootdir)/$(RI_BASE_NAME)
43
+ mandir = $(datarootdir)/man
44
+ localedir = $(datarootdir)/locale
45
+ libdir = $(exec_prefix)/lib
46
+ psdir = $(docdir)
47
+ pdfdir = $(docdir)
48
+ dvidir = $(docdir)
49
+ htmldir = $(docdir)
50
+ infodir = $(datarootdir)/info
51
+ docdir = $(datarootdir)/doc/$(PACKAGE)
52
+ oldincludedir = $(DESTDIR)/usr/include
53
+ includedir = $(prefix)/include
54
+ localstatedir = $(prefix)/var
55
+ sharedstatedir = $(prefix)/com
56
+ sysconfdir = $(prefix)/etc
57
+ datadir = $(datarootdir)
58
+ datarootdir = $(prefix)/share
59
+ libexecdir = $(exec_prefix)/libexec
60
+ sbindir = $(exec_prefix)/sbin
61
+ bindir = $(exec_prefix)/bin
62
+ archdir = $(rubyarchdir)
63
+
64
+
65
+ CC = clang
66
+ CXX = g++
67
+ LIBRUBY = $(LIBRUBY_A)
68
+ LIBRUBY_A = lib$(RUBY_SO_NAME)-static.a
69
+ LIBRUBYARG_SHARED =
70
+ LIBRUBYARG_STATIC = -l$(RUBY_SO_NAME)-static -framework CoreFoundation
71
+ empty =
72
+ OUTFLAG = -o $(empty)
73
+ COUTFLAG = -o $(empty)
74
+
75
+ RUBY_EXTCONF_H =
76
+ cflags = $(optflags) $(debugflags) $(warnflags)
77
+ optflags = -O3 -fno-fast-math
78
+ debugflags = -ggdb3
79
+ warnflags = -Wall -Wextra -Wno-unused-parameter -Wno-parentheses -Wno-long-long -Wno-missing-field-initializers -Wunused-variable -Wpointer-arith -Wwrite-strings -Wdeclaration-after-statement -Wshorten-64-to-32 -Wimplicit-function-declaration -Wdivision-by-zero -Wextra-tokens
80
+ CCDLFLAGS = -fno-common
81
+ CFLAGS = $(CCDLFLAGS) -O3 -Wno-error=shorten-64-to-32 -pipe $(ARCH_FLAG)
82
+ INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir)
83
+ DEFS =
84
+ CPPFLAGS = -I/Users/duane/.rbenv/versions/2.1.3/include -D_XOPEN_SOURCE -D_DARWIN_C_SOURCE -D_DARWIN_UNLIMITED_SELECT -D_REENTRANT $(DEFS) $(cppflags)
85
+ CXXFLAGS = $(CCDLFLAGS) $(cxxflags) $(ARCH_FLAG)
86
+ ldflags = -L. -L/Users/duane/.rbenv/versions/2.1.3/lib -fstack-protector -L/usr/local/lib
87
+ dldflags = -Wl,-undefined,dynamic_lookup -Wl,-multiply_defined,suppress
88
+ ARCH_FLAG =
89
+ DLDFLAGS = $(ldflags) $(dldflags) $(ARCH_FLAG)
90
+ LDSHARED = $(CC) -dynamic -bundle
91
+ LDSHAREDXX = $(CXX) -dynamic -bundle
92
+ AR = ar
93
+ EXEEXT =
94
+
95
+ RUBY_INSTALL_NAME = ruby
96
+ RUBY_SO_NAME = ruby
97
+ RUBYW_INSTALL_NAME =
98
+ RUBY_VERSION_NAME = $(RUBY_BASE_NAME)-$(ruby_version)
99
+ RUBYW_BASE_NAME = rubyw
100
+ RUBY_BASE_NAME = ruby
101
+
102
+ arch = x86_64-darwin13.0
103
+ sitearch = $(arch)
104
+ ruby_version = 2.1.0
105
+ ruby = $(bindir)/ruby
106
+ RUBY = $(ruby)
107
+ ruby_headers = $(hdrdir)/ruby.h $(hdrdir)/ruby/ruby.h $(hdrdir)/ruby/defines.h $(hdrdir)/ruby/missing.h $(hdrdir)/ruby/intern.h $(hdrdir)/ruby/st.h $(hdrdir)/ruby/subst.h $(arch_hdrdir)/ruby/config.h
108
+
109
+ RM = rm -f
110
+ RM_RF = $(RUBY) -run -e rm -- -rf
111
+ RMDIRS = rmdir -p
112
+ MAKEDIRS = mkdir -p
113
+ INSTALL = /usr/bin/install -c
114
+ INSTALL_PROG = $(INSTALL) -m 0755
115
+ INSTALL_DATA = $(INSTALL) -m 644
116
+ COPY = cp
117
+ TOUCH = exit >
118
+
119
+ #### End of system configuration section. ####
120
+
121
+ preload =
122
+
123
+ libpath = . $(libdir)
124
+ LIBPATH = -L. -L$(libdir)
125
+ DEFFILE =
126
+
127
+ CLEANFILES = mkmf.log
128
+ DISTCLEANFILES =
129
+ DISTCLEANDIRS =
130
+
131
+ extout =
132
+ extout_prefix =
133
+ target_prefix =
134
+ LOCAL_LIBS =
135
+ LIBS = -lpthread -lgmp -ldl -lobjc
136
+ ORIG_SRCS = wordtree.cc
137
+ SRCS = $(ORIG_SRCS)
138
+ OBJS = wordtree.o
139
+ HDRS =
140
+ TARGET = wordtree
141
+ TARGET_NAME = wordtree
142
+ TARGET_ENTRY = Init_$(TARGET_NAME)
143
+ DLLIB = $(TARGET).bundle
144
+ EXTSTATIC =
145
+ STATIC_LIB =
146
+
147
+ TIMESTAMP_DIR = .
148
+ BINDIR = $(bindir)
149
+ RUBYCOMMONDIR = $(sitedir)$(target_prefix)
150
+ RUBYLIBDIR = $(sitelibdir)$(target_prefix)
151
+ RUBYARCHDIR = $(sitearchdir)$(target_prefix)
152
+ HDRDIR = $(rubyhdrdir)/ruby$(target_prefix)
153
+ ARCHHDRDIR = $(rubyhdrdir)/$(arch)/ruby$(target_prefix)
154
+
155
+ TARGET_SO = $(DLLIB)
156
+ CLEANLIBS = $(TARGET).bundle
157
+ CLEANOBJS = *.o *.bak
158
+
159
+ all: $(DLLIB)
160
+ static: $(STATIC_LIB)
161
+ .PHONY: all install static install-so install-rb
162
+ .PHONY: clean clean-so clean-static clean-rb
163
+
164
+ clean-static::
165
+ clean-rb-default::
166
+ clean-rb::
167
+ clean-so::
168
+ clean: clean-so clean-static clean-rb-default clean-rb
169
+ -$(Q)$(RM) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES) .*.time
170
+
171
+ distclean-rb-default::
172
+ distclean-rb::
173
+ distclean-so::
174
+ distclean-static::
175
+ distclean: clean distclean-so distclean-static distclean-rb-default distclean-rb
176
+ -$(Q)$(RM) Makefile $(RUBY_EXTCONF_H) conftest.* mkmf.log
177
+ -$(Q)$(RM) core ruby$(EXEEXT) *~ $(DISTCLEANFILES)
178
+ -$(Q)$(RMDIRS) $(DISTCLEANDIRS) 2> /dev/null || true
179
+
180
+ realclean: distclean
181
+ install: install-so install-rb
182
+
183
+ install-so: $(DLLIB) $(TIMESTAMP_DIR)/.RUBYARCHDIR.time
184
+ $(INSTALL_PROG) $(DLLIB) $(RUBYARCHDIR)
185
+ clean-static::
186
+ -$(Q)$(RM) $(STATIC_LIB)
187
+ install-rb: pre-install-rb install-rb-default
188
+ install-rb-default: pre-install-rb-default
189
+ pre-install-rb: Makefile
190
+ pre-install-rb-default: Makefile
191
+ pre-install-rb-default:
192
+ $(ECHO) installing default wordtree libraries
193
+ $(TIMESTAMP_DIR)/.RUBYARCHDIR.time:
194
+ $(Q) $(MAKEDIRS) $(@D) $(RUBYARCHDIR)
195
+ $(Q) $(TOUCH) $@
196
+
197
+ site-install: site-install-so site-install-rb
198
+ site-install-so: install-so
199
+ site-install-rb: install-rb
200
+
201
+ .SUFFIXES: .c .m .cc .mm .cxx .cpp .C .o
202
+
203
+ .cc.o:
204
+ $(ECHO) compiling $(<)
205
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
206
+
207
+ .mm.o:
208
+ $(ECHO) compiling $(<)
209
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
210
+
211
+ .cxx.o:
212
+ $(ECHO) compiling $(<)
213
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
214
+
215
+ .cpp.o:
216
+ $(ECHO) compiling $(<)
217
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
218
+
219
+ .C.o:
220
+ $(ECHO) compiling $(<)
221
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
222
+
223
+ .c.o:
224
+ $(ECHO) compiling $(<)
225
+ $(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $<
226
+
227
+ .m.o:
228
+ $(ECHO) compiling $(<)
229
+ $(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $<
230
+
231
+ $(DLLIB): $(OBJS) Makefile
232
+ $(ECHO) linking shared-object $(DLLIB)
233
+ -$(Q)$(RM) $(@)
234
+ $(Q) $(LDSHAREDXX) -o $@ $(OBJS) $(LIBPATH) $(DLDFLAGS) $(LOCAL_LIBS) $(LIBS)
235
+ $(Q) $(POSTLINK)
236
+
237
+
238
+
239
+ $(OBJS): $(HDRS) $(ruby_headers)
data/ext/extconf.rb ADDED
@@ -0,0 +1,5 @@
1
+ require "mkmf"
2
+
3
+ dir_config("wordtree")
4
+
5
+ create_makefile("wordtree")
data/ext/wordtree.cc ADDED
@@ -0,0 +1,125 @@
1
+ #include <ruby.h>
2
+ #include <ruby/encoding.h>
3
+ #include <stdio.h>
4
+ #include <string.h>
5
+ #include <assert.h>
6
+
7
+ // for rubinius
8
+ #ifndef rb_enc_fast_mbclen
9
+ # define rb_enc_fast_mbclen rb_enc_mbclen
10
+ #endif
11
+
12
+ static rb_encoding* u8_enc;
13
+ static rb_encoding* bin_enc;
14
+
15
/** Transforms text such as the following:
 *
 *   And behold, I said, "This is no good!"
 *   What shall ye say unto these people, there-
 *   fore?
 *
 * Into a cleaned up single line of text, like the following:
 *
 *   and behold i said this is no good.what shall ye say unto these people therefore.
 *
 * Spaces indicate word boundaries, while periods indicate sentence boundaries.
 *
 * The buffer is rewritten in place. The cleaned text is never longer than the
 * input, and it is NOT re-terminated: only the first `return value` bytes of
 * `text` are meaningful afterwards.
 *
 * @param text  NUL-terminated, writable buffer to clean in place.
 * @return      Length in bytes of the cleaned text at the start of `text`.
 */
size_t text_clean_cstr(char* text)
{
  if (*text == '\0') return 0;

  char* write = text;
  bool join_lines = false;
  // Starts out true so that leading whitespace is dropped. Note that in this
  // initial state no space has actually been written yet.
  bool just_added_space = true;
  bool just_added_period = false;

  for (char* read = text; *read; read++) {
    char c = *read;
    if (c >= 'A' && c <= 'Z') {
      // Change upper case to lowercase
      c += 32;
    } else if (c == '\n') {
      // Change newlines to spaces (i.e. both count as whitespace)
      c = ' ';
    } else if (c == '?' || c == '!') {
      // Change exclamation, question marks to periods (i.e. sentence boundaries)
      c = '.';
    }

    if (c == '-') {
      join_lines = true;
    } else if (join_lines && c == ' ') {
      // ignore whitespace after a dash (i.e. including newlines, which is the
      // most common case because words that are broken by syllables are dashed)
    } else if (c == '.' && !just_added_period) {
      // Erase the space before a period. The `write > text` guard covers the
      // initial state, where just_added_space is set but nothing has been
      // written; without it the period would land one byte before the buffer.
      if (just_added_space && write > text) write--;
      *write++ = '.';
      just_added_period = true;
      just_added_space = false;
      join_lines = false;
    } else if (c == ' ' && !just_added_space && !just_added_period) {
      *write++ = ' ';
      just_added_space = true;
      just_added_period = false;
    } else if (c >= 'a' && c <= 'z') {
      *write++ = c;
      just_added_space = false;
      just_added_period = false;
      join_lines = false;
    }
    // Every other character (digits, punctuation, non-ASCII bytes) is dropped.
  }

  // Erase a trailing space, but only if one was really written; input with
  // no letters at all must yield length 0 rather than wrapping around.
  if (just_added_space && write > text) write--;

  // Return the new length of the string
  return (size_t)(write - text);
}
78
+
79
+ static VALUE text_common_trigrams(VALUE self, VALUE text) {
80
+ char* ptext = RSTRING_PTR(text);
81
+ long len = RSTRING_LEN(text);
82
+
83
+ if (len < 3) return INT2NUM(0);
84
+
85
+ /* 28 most common English trigrams, all squished together */
86
+ char common_trigrams[] = "allandedtentereforhashatherhisingionithmenncendeoftsthterthathethitiotisverwaswityou";
87
+
88
+ char* ptr = ptext;
89
+ char* tail = ptext + len;
90
+ int i = 0, common_matched = 0;
91
+ while (ptr < tail) {
92
+ for (i = 0; i < sizeof(common_trigrams); i += 3) {
93
+ if (memcmp(ptr, common_trigrams + i, 3) == 0) {
94
+ common_matched++;
95
+ break;
96
+ }
97
+ }
98
+ ptr++;
99
+ }
100
+
101
+ return INT2NUM(common_matched);
102
+ }
103
+
104
+ static VALUE text_clean(VALUE self, VALUE text) {
105
+ rb_str_modify(text);
106
+
107
+ char* ctext = StringValueCStr(text);
108
+ size_t new_length = text_clean_cstr(ctext);
109
+
110
+ rb_str_set_len(text, (long)new_length);
111
+
112
+ return text;
113
+ }
114
+
115
+ extern "C"
116
+ void Init_wordtree() {
117
+ VALUE rb_mWordTree = rb_define_module("WordTree");
118
+ VALUE rb_mText = rb_define_module_under(rb_mWordTree, "Text");
119
+
120
+ u8_enc = rb_utf8_encoding();
121
+ bin_enc = rb_ascii8bit_encoding();
122
+
123
+ rb_define_module_function(rb_mText, "clean", RUBY_METHOD_FUNC(text_clean), 1);
124
+ rb_define_module_function(rb_mText, "common_trigrams", RUBY_METHOD_FUNC(text_common_trigrams), 1);
125
+ }
@@ -23,10 +23,8 @@ module WordTree
23
23
 
24
24
  def content_for(archivist_book)
25
25
  [archivist_book.download, nil]
26
- rescue Archivist::Model::Document::UnsupportedFormat => e
27
- [nil, e.to_s]
28
- rescue StandardError => e
29
- [nil, e.to_s]
26
+ rescue StandardError, Archivist::Model::Document::UnsupportedFormat => e
27
+ [nil, e]
30
28
  end
31
29
 
32
30
  def download_all(search_terms, &each_book)
data/lib/wordtree/book.rb CHANGED
@@ -1,7 +1,8 @@
1
1
  require 'virtus'
2
2
  require 'simhash'
3
+ require 'set'
3
4
 
4
- require 'wordtree/text_utils'
5
+ require 'wordtree/text'
5
6
 
6
7
  module WordTree
7
8
  class Book
@@ -18,12 +19,12 @@ module WordTree
18
19
  attribute :size_bytes, Integer, :default => :content_size
19
20
  # A simhash (locality-sensitive hash) of the content
20
21
  attribute :simhash, Integer
22
+ attribute :ngrams_counted, Set
21
23
 
22
24
  attribute :content, String
23
25
 
24
26
  def initialize(*args)
25
27
  super
26
- @ngrams = {}
27
28
  end
28
29
 
29
30
  def self.create(id, metadata, content)
@@ -38,44 +39,17 @@ module WordTree
38
39
  attributes.select{ |k,v| !v.nil? && k != :content }
39
40
  end
40
41
 
41
- def content_clean(wrap=120)
42
- if @content_clean_wrap != wrap
43
- # Memoize content_clean (using last wrap size)
44
- @content_clean_wrap = wrap
45
- @content_clean = TextUtils.clean_text(content, wrap)
46
- end
47
- @content_clean
42
+ # Modify and clean content in-place (slightly faster)
43
+ def content_clean!
44
+ WordTree::Text.clean(content)
48
45
  end
49
46
 
50
- def content_size
51
- content ? content.size : nil
52
- end
53
-
54
- def each_ngram(n=1, &block)
55
- TextUtils.each_ngram(content_clean, n, &block)
56
- end
57
-
58
- def set_ngrams(n, lookup)
59
- raise ArgumentError, "must be a Hash" unless lookup.is_a?(Hash)
60
- @ngrams[n] = lookup
47
+ def content_clean
48
+ @content_clean ||= WordTree::Text.clean(content.dup)
61
49
  end
62
50
 
63
- def ngrams(n=1)
64
- # Memoize ngram counts
65
- @ngrams[n] ||= count_ngrams(n)
66
- end
67
-
68
- def all_ngrams
69
- @ngrams
70
- end
71
-
72
- def count_ngrams(n=1)
73
- {}.tap do |tally|
74
- each_ngram(n) do |ngram|
75
- tally[ngram] ||= 0
76
- tally[ngram] += 1
77
- end
78
- end
51
+ def content_size
52
+ content ? content.size : nil
79
53
  end
80
54
 
81
55
  def calculate_simhash
@@ -0,0 +1,38 @@
1
module WordTree
  # An Enumerable over a collection of books, built from one of several
  # kinds of source (see #initialize).
  class BookList
    include Enumerable

    # can be initialized from the following sources:
    # - a WordTree::Disk::Library object
    # - an open File object (containing a list of files or paths to books)
    # - a String directory (presumed to be the library on disk)
    # - a String file (containing a list of files or paths to books)
    #
    # Raises Errno::ENOENT when a String source names nothing on disk, and
    # ArgumentError for any other kind of source.
    def initialize(source)
      @source = source
      @iterable = iterable_from_source(source)
    end

    # Returns the object that #each delegates to: the library itself, or an
    # Array with one entry per line of the listing file.
    def iterable_from_source(source)
      case source
      when WordTree::Disk::Library
        source
      when File
        begin
          source.read.split("\n")
        ensure
          # Close the File that was handed in, even if reading fails.
          source.close
        end
      when String
        if File.directory?(source)
          WordTree::Disk::Library.new(source)
        elsif File.exist?(source)
          File.read(source).split("\n")
        else
          raise Errno::ENOENT, "Unable to find source for BookList, #{source.inspect}"
        end
      else
        # Fail here rather than with a NoMethodError on nil in #each.
        raise ArgumentError, "Unsupported source for BookList, #{source.inspect}"
      end
    end

    # Yields each entry of the underlying library or listing.
    def each(&block)
      @iterable.each(&block)
    end
  end
end
@@ -10,6 +10,8 @@ module WordTree
10
10
 
11
11
  attr_reader :library
12
12
 
13
+ MissingContent = Class.new(StandardError)
14
+
13
15
  # @library can be either a string (the path of the library) or a
14
16
  # WordTree::Disk::Library object
15
17
  def initialize(library)
@@ -20,7 +22,7 @@ module WordTree
20
22
  end
21
23
  end
22
24
 
23
- def find_without_ngrams(book_id)
25
+ def find(book_id)
24
26
  begin
25
27
  retrieved = Preamble.load(library.path_to(book_id), :external_encoding => "utf-8")
26
28
  Book.create(book_id, retrieved.metadata, retrieved.content)
@@ -29,45 +31,17 @@ module WordTree
29
31
  end
30
32
  end
31
33
 
32
- def find(book_id)
33
- find_without_ngrams(book_id).tap do |book|
34
- (1..9).each do |n|
35
- path = library.path_to(book_id, :ngrams, :n => n)
36
- if File.exist?(path)
37
- File.open(path) do |f|
38
- hash = JSON.load(f)
39
- book.set_ngrams(n, hash)
40
- end
41
- end
42
- end
43
- end
44
- end
45
-
46
34
  def each(file_suffix_re=/\.(md|txt)$/, &block)
47
- library.each(file_suffix_re) do |path|
35
+ library.each_with_id(file_suffix_re) do |path, id|
48
36
  retrieved = Preamble.load(path, :external_encoding => "utf-8")
49
37
  yield Book.new(retrieved.metadata.merge("content" => retrieved.content))
50
38
  end
51
39
  end
52
40
 
53
- def save_without_ngrams(book)
54
- library.mkdir(book.id)
55
- Preamble.new(book.metadata, book.content || "").save(library.path_to(book.id))
56
- end
57
-
58
- def save_ngrams(book)
59
- book.all_ngrams.each_pair do |n, hash|
60
- path = library.path_to(book.id, :ngrams, :n => n)
61
- File.open(path, "w") do |file|
62
- file.write hash.to_json
63
- end
64
- end
65
- end
66
-
67
41
  def save(book)
68
- save_without_ngrams(book).tap do
69
- save_ngrams(book)
70
- end
42
+ library.mkdir(book.id)
43
+ raise MissingContent, "book #{book.id} is missing content" unless book.content
44
+ Preamble.new(book.metadata, book.content).save(library.path_to(book.id))
71
45
  end
72
46
 
73
47
  def archive_org_get(*book_ids, &block)
@@ -92,6 +66,7 @@ module WordTree
92
66
  if failure
93
67
  #TODO: logging
94
68
  $stderr.puts "Unable to download from archive.org: #{failure}"
69
+ raise failure
95
70
  else
96
71
  book = Book.create(metadata["archive_org_id"], metadata, content)
97
72
  save(book)
@@ -45,7 +45,7 @@ module WordTree
45
45
  end
46
46
 
47
47
  # Breadth-first search of the directory structure, operating on each book
48
- def each(file_suffix_re=/\.(md|txt)$/, &block)
48
+ def each_with_id(file_suffix_re=/\.(md|txt)$/, &block)
49
49
  Find.find(@root) do |path|
50
50
  if FileTest.directory?(path)
51
51
  if File.basename(path)[0] == ?.
@@ -60,6 +60,10 @@ module WordTree
60
60
  end
61
61
  end
62
62
 
63
# Yields only the path of each book found by each_with_id.
# +file_suffix_re+ is forwarded to each_with_id unchanged, so callers that
# passed a suffix pattern to the previous #each keep working.
def each(file_suffix_re=/\.(md|txt)$/, &block)
  each_with_id(file_suffix_re) { |path, _id| yield path }
end
66
+
63
67
  end
64
68
  end
65
69
  end
@@ -0,0 +1,12 @@
1
module WordTree
  # Keeps a running count per n-gram, stored in a Trie.
  class Ngrams
    def initialize
      @trie = Trie.new
    end

    # Adds one to the count stored for +ngram+; an n-gram with no stored
    # count starts from zero. Returns whatever Trie#set returns.
    def inc(ngram)
      previous = @trie.get(ngram)
      previous = 0 unless previous
      @trie.set(ngram, previous + 1)
    end
  end
end
@@ -0,0 +1,37 @@
1
+ require 'strscan'
2
+ require_relative "../../ext/wordtree"
3
+
4
module WordTree
  module Text
    # Splits +text+ in two at the last space found at or before
    # +split_index+, dropping that space. When +split_index+ is at or beyond
    # the end of +text+ the whole text is returned with an empty remainder;
    # when no space is found the text is cut exactly at +split_index+.
    #
    # Returns a two-element Array: [head, remainder].
    def self.split_near(text, split_index)
      return [text, ""] if split_index >= text.size

      # rindex treats a negative position as an offset from the end, so only
      # use it for non-negative indexes.
      space_at = split_index >= 0 ? text.rindex(' ', split_index) : nil
      if space_at
        [text[0...space_at], text[(space_at + 1)..-1]]
      else
        [text[0...split_index], text[split_index..-1]]
      end
    end

    # Wraps +input+ at +wrap+ characters per line, preferring to break at
    # spaces (see split_near), and returns the wrapped text with every line
    # terminated by a newline.
    #
    # Raises ArgumentError unless +wrap+ is positive, since a non-positive
    # width could never make progress.
    def self.word_wrap(input, wrap=120)
      raise ArgumentError, "wrap must be positive" unless wrap > 0

      wrapped_output = String.new
      remainder = input
      loop do
        # Always split what is left over, not the original input, so each
        # pass consumes one line.
        output_line, remainder = split_near(remainder, wrap)
        wrapped_output << output_line << "\n"
        break unless remainder.size > wrap
      end
      wrapped_output << remainder << "\n" unless remainder.empty?

      wrapped_output
    end

  end
end
@@ -1,3 +1,3 @@
1
- module Wordtree
2
- VERSION = "0.3.1"
1
+ module WordTree
2
+ VERSION = "0.4.0"
3
3
  end