wordtree 0.3.1 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/ext/Makefile +239 -0
- data/ext/extconf.rb +5 -0
- data/ext/wordtree.cc +125 -0
- data/lib/wordtree/archdown.rb +2 -4
- data/lib/wordtree/book.rb +10 -36
- data/lib/wordtree/book_list.rb +38 -0
- data/lib/wordtree/disk/librarian.rb +8 -33
- data/lib/wordtree/disk/library.rb +5 -1
- data/lib/wordtree/ngrams.rb +12 -0
- data/lib/wordtree/text.rb +37 -0
- data/lib/wordtree/version.rb +2 -2
- data/spec/wordtree/book_spec.rb +26 -44
- data/spec/wordtree/disk/librarian_spec.rb +0 -44
- data/spec/wordtree/text_spec.rb +81 -0
- data/wordtree.gemspec +17 -15
- metadata +68 -93
- data/lib/wordtree/text_utils.rb +0 -107
- data/spec/wordtree/text_utils_spec.rb +0 -89
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 7bcb5a59a130a24ca1bede26a9f3b8efa651230c
|
4
|
+
data.tar.gz: 03ef94ffd836a11f891065fd7eb3c73d4aa8bdd2
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: cbf2b847f90fdffd1a52a4f7ea2c2fd78ce339ded803cbf30720fba1dced77b2ea2e9b1b84f5fee147d472d0e673437ad0e58ccffcb1a5fd2794fa87ec739384
|
7
|
+
data.tar.gz: 0abb233dd846913d69e13087d84c174136e8a1511c417f5239e3395972f17af71f43091079e8fc605354843c09bb73b6e82ca358801303d77f692c3d8d140ea5
|
data/ext/Makefile
ADDED
@@ -0,0 +1,239 @@
|
|
1
|
+
|
2
|
+
SHELL = /bin/sh
|
3
|
+
|
4
|
+
# V=0 quiet, V=1 verbose. other values don't work.
|
5
|
+
V = 0
|
6
|
+
Q1 = $(V:1=)
|
7
|
+
Q = $(Q1:0=@)
|
8
|
+
ECHO1 = $(V:1=@:)
|
9
|
+
ECHO = $(ECHO1:0=@echo)
|
10
|
+
|
11
|
+
#### Start of system configuration section. ####
|
12
|
+
|
13
|
+
srcdir = .
|
14
|
+
topdir = /Users/duane/.rbenv/versions/2.1.3/include/ruby-2.1.0
|
15
|
+
hdrdir = $(topdir)
|
16
|
+
arch_hdrdir = /Users/duane/.rbenv/versions/2.1.3/include/ruby-2.1.0/x86_64-darwin13.0
|
17
|
+
PATH_SEPARATOR = :
|
18
|
+
VPATH = $(srcdir):$(arch_hdrdir)/ruby:$(hdrdir)/ruby
|
19
|
+
prefix = $(DESTDIR)/Users/duane/.rbenv/versions/2.1.3
|
20
|
+
rubysitearchprefix = $(rubylibprefix)/$(sitearch)
|
21
|
+
rubyarchprefix = $(rubylibprefix)/$(arch)
|
22
|
+
rubylibprefix = $(libdir)/$(RUBY_BASE_NAME)
|
23
|
+
exec_prefix = $(prefix)
|
24
|
+
vendorarchhdrdir = $(vendorhdrdir)/$(sitearch)
|
25
|
+
sitearchhdrdir = $(sitehdrdir)/$(sitearch)
|
26
|
+
rubyarchhdrdir = $(rubyhdrdir)/$(arch)
|
27
|
+
vendorhdrdir = $(rubyhdrdir)/vendor_ruby
|
28
|
+
sitehdrdir = $(rubyhdrdir)/site_ruby
|
29
|
+
rubyhdrdir = $(includedir)/$(RUBY_VERSION_NAME)
|
30
|
+
vendorarchdir = $(vendorlibdir)/$(sitearch)
|
31
|
+
vendorlibdir = $(vendordir)/$(ruby_version)
|
32
|
+
vendordir = $(rubylibprefix)/vendor_ruby
|
33
|
+
sitearchdir = $(sitelibdir)/$(sitearch)
|
34
|
+
sitelibdir = $(sitedir)/$(ruby_version)
|
35
|
+
sitedir = $(rubylibprefix)/site_ruby
|
36
|
+
rubyarchdir = $(rubylibdir)/$(arch)
|
37
|
+
rubylibdir = $(rubylibprefix)/$(ruby_version)
|
38
|
+
sitearchincludedir = $(includedir)/$(sitearch)
|
39
|
+
archincludedir = $(includedir)/$(arch)
|
40
|
+
sitearchlibdir = $(libdir)/$(sitearch)
|
41
|
+
archlibdir = $(libdir)/$(arch)
|
42
|
+
ridir = $(datarootdir)/$(RI_BASE_NAME)
|
43
|
+
mandir = $(datarootdir)/man
|
44
|
+
localedir = $(datarootdir)/locale
|
45
|
+
libdir = $(exec_prefix)/lib
|
46
|
+
psdir = $(docdir)
|
47
|
+
pdfdir = $(docdir)
|
48
|
+
dvidir = $(docdir)
|
49
|
+
htmldir = $(docdir)
|
50
|
+
infodir = $(datarootdir)/info
|
51
|
+
docdir = $(datarootdir)/doc/$(PACKAGE)
|
52
|
+
oldincludedir = $(DESTDIR)/usr/include
|
53
|
+
includedir = $(prefix)/include
|
54
|
+
localstatedir = $(prefix)/var
|
55
|
+
sharedstatedir = $(prefix)/com
|
56
|
+
sysconfdir = $(prefix)/etc
|
57
|
+
datadir = $(datarootdir)
|
58
|
+
datarootdir = $(prefix)/share
|
59
|
+
libexecdir = $(exec_prefix)/libexec
|
60
|
+
sbindir = $(exec_prefix)/sbin
|
61
|
+
bindir = $(exec_prefix)/bin
|
62
|
+
archdir = $(rubyarchdir)
|
63
|
+
|
64
|
+
|
65
|
+
CC = clang
|
66
|
+
CXX = g++
|
67
|
+
LIBRUBY = $(LIBRUBY_A)
|
68
|
+
LIBRUBY_A = lib$(RUBY_SO_NAME)-static.a
|
69
|
+
LIBRUBYARG_SHARED =
|
70
|
+
LIBRUBYARG_STATIC = -l$(RUBY_SO_NAME)-static -framework CoreFoundation
|
71
|
+
empty =
|
72
|
+
OUTFLAG = -o $(empty)
|
73
|
+
COUTFLAG = -o $(empty)
|
74
|
+
|
75
|
+
RUBY_EXTCONF_H =
|
76
|
+
cflags = $(optflags) $(debugflags) $(warnflags)
|
77
|
+
optflags = -O3 -fno-fast-math
|
78
|
+
debugflags = -ggdb3
|
79
|
+
warnflags = -Wall -Wextra -Wno-unused-parameter -Wno-parentheses -Wno-long-long -Wno-missing-field-initializers -Wunused-variable -Wpointer-arith -Wwrite-strings -Wdeclaration-after-statement -Wshorten-64-to-32 -Wimplicit-function-declaration -Wdivision-by-zero -Wextra-tokens
|
80
|
+
CCDLFLAGS = -fno-common
|
81
|
+
CFLAGS = $(CCDLFLAGS) -O3 -Wno-error=shorten-64-to-32 -pipe $(ARCH_FLAG)
|
82
|
+
INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir)
|
83
|
+
DEFS =
|
84
|
+
CPPFLAGS = -I/Users/duane/.rbenv/versions/2.1.3/include -D_XOPEN_SOURCE -D_DARWIN_C_SOURCE -D_DARWIN_UNLIMITED_SELECT -D_REENTRANT $(DEFS) $(cppflags)
|
85
|
+
CXXFLAGS = $(CCDLFLAGS) $(cxxflags) $(ARCH_FLAG)
|
86
|
+
ldflags = -L. -L/Users/duane/.rbenv/versions/2.1.3/lib -fstack-protector -L/usr/local/lib
|
87
|
+
dldflags = -Wl,-undefined,dynamic_lookup -Wl,-multiply_defined,suppress
|
88
|
+
ARCH_FLAG =
|
89
|
+
DLDFLAGS = $(ldflags) $(dldflags) $(ARCH_FLAG)
|
90
|
+
LDSHARED = $(CC) -dynamic -bundle
|
91
|
+
LDSHAREDXX = $(CXX) -dynamic -bundle
|
92
|
+
AR = ar
|
93
|
+
EXEEXT =
|
94
|
+
|
95
|
+
RUBY_INSTALL_NAME = ruby
|
96
|
+
RUBY_SO_NAME = ruby
|
97
|
+
RUBYW_INSTALL_NAME =
|
98
|
+
RUBY_VERSION_NAME = $(RUBY_BASE_NAME)-$(ruby_version)
|
99
|
+
RUBYW_BASE_NAME = rubyw
|
100
|
+
RUBY_BASE_NAME = ruby
|
101
|
+
|
102
|
+
arch = x86_64-darwin13.0
|
103
|
+
sitearch = $(arch)
|
104
|
+
ruby_version = 2.1.0
|
105
|
+
ruby = $(bindir)/ruby
|
106
|
+
RUBY = $(ruby)
|
107
|
+
ruby_headers = $(hdrdir)/ruby.h $(hdrdir)/ruby/ruby.h $(hdrdir)/ruby/defines.h $(hdrdir)/ruby/missing.h $(hdrdir)/ruby/intern.h $(hdrdir)/ruby/st.h $(hdrdir)/ruby/subst.h $(arch_hdrdir)/ruby/config.h
|
108
|
+
|
109
|
+
RM = rm -f
|
110
|
+
RM_RF = $(RUBY) -run -e rm -- -rf
|
111
|
+
RMDIRS = rmdir -p
|
112
|
+
MAKEDIRS = mkdir -p
|
113
|
+
INSTALL = /usr/bin/install -c
|
114
|
+
INSTALL_PROG = $(INSTALL) -m 0755
|
115
|
+
INSTALL_DATA = $(INSTALL) -m 644
|
116
|
+
COPY = cp
|
117
|
+
TOUCH = exit >
|
118
|
+
|
119
|
+
#### End of system configuration section. ####
|
120
|
+
|
121
|
+
preload =
|
122
|
+
|
123
|
+
libpath = . $(libdir)
|
124
|
+
LIBPATH = -L. -L$(libdir)
|
125
|
+
DEFFILE =
|
126
|
+
|
127
|
+
CLEANFILES = mkmf.log
|
128
|
+
DISTCLEANFILES =
|
129
|
+
DISTCLEANDIRS =
|
130
|
+
|
131
|
+
extout =
|
132
|
+
extout_prefix =
|
133
|
+
target_prefix =
|
134
|
+
LOCAL_LIBS =
|
135
|
+
LIBS = -lpthread -lgmp -ldl -lobjc
|
136
|
+
ORIG_SRCS = wordtree.cc
|
137
|
+
SRCS = $(ORIG_SRCS)
|
138
|
+
OBJS = wordtree.o
|
139
|
+
HDRS =
|
140
|
+
TARGET = wordtree
|
141
|
+
TARGET_NAME = wordtree
|
142
|
+
TARGET_ENTRY = Init_$(TARGET_NAME)
|
143
|
+
DLLIB = $(TARGET).bundle
|
144
|
+
EXTSTATIC =
|
145
|
+
STATIC_LIB =
|
146
|
+
|
147
|
+
TIMESTAMP_DIR = .
|
148
|
+
BINDIR = $(bindir)
|
149
|
+
RUBYCOMMONDIR = $(sitedir)$(target_prefix)
|
150
|
+
RUBYLIBDIR = $(sitelibdir)$(target_prefix)
|
151
|
+
RUBYARCHDIR = $(sitearchdir)$(target_prefix)
|
152
|
+
HDRDIR = $(rubyhdrdir)/ruby$(target_prefix)
|
153
|
+
ARCHHDRDIR = $(rubyhdrdir)/$(arch)/ruby$(target_prefix)
|
154
|
+
|
155
|
+
TARGET_SO = $(DLLIB)
|
156
|
+
CLEANLIBS = $(TARGET).bundle
|
157
|
+
CLEANOBJS = *.o *.bak
|
158
|
+
|
159
|
+
all: $(DLLIB)
|
160
|
+
static: $(STATIC_LIB)
|
161
|
+
.PHONY: all install static install-so install-rb
|
162
|
+
.PHONY: clean clean-so clean-static clean-rb
|
163
|
+
|
164
|
+
clean-static::
|
165
|
+
clean-rb-default::
|
166
|
+
clean-rb::
|
167
|
+
clean-so::
|
168
|
+
clean: clean-so clean-static clean-rb-default clean-rb
|
169
|
+
-$(Q)$(RM) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES) .*.time
|
170
|
+
|
171
|
+
distclean-rb-default::
|
172
|
+
distclean-rb::
|
173
|
+
distclean-so::
|
174
|
+
distclean-static::
|
175
|
+
distclean: clean distclean-so distclean-static distclean-rb-default distclean-rb
|
176
|
+
-$(Q)$(RM) Makefile $(RUBY_EXTCONF_H) conftest.* mkmf.log
|
177
|
+
-$(Q)$(RM) core ruby$(EXEEXT) *~ $(DISTCLEANFILES)
|
178
|
+
-$(Q)$(RMDIRS) $(DISTCLEANDIRS) 2> /dev/null || true
|
179
|
+
|
180
|
+
realclean: distclean
|
181
|
+
install: install-so install-rb
|
182
|
+
|
183
|
+
install-so: $(DLLIB) $(TIMESTAMP_DIR)/.RUBYARCHDIR.time
|
184
|
+
$(INSTALL_PROG) $(DLLIB) $(RUBYARCHDIR)
|
185
|
+
clean-static::
|
186
|
+
-$(Q)$(RM) $(STATIC_LIB)
|
187
|
+
install-rb: pre-install-rb install-rb-default
|
188
|
+
install-rb-default: pre-install-rb-default
|
189
|
+
pre-install-rb: Makefile
|
190
|
+
pre-install-rb-default: Makefile
|
191
|
+
pre-install-rb-default:
|
192
|
+
$(ECHO) installing default wordtree libraries
|
193
|
+
$(TIMESTAMP_DIR)/.RUBYARCHDIR.time:
|
194
|
+
$(Q) $(MAKEDIRS) $(@D) $(RUBYARCHDIR)
|
195
|
+
$(Q) $(TOUCH) $@
|
196
|
+
|
197
|
+
site-install: site-install-so site-install-rb
|
198
|
+
site-install-so: install-so
|
199
|
+
site-install-rb: install-rb
|
200
|
+
|
201
|
+
.SUFFIXES: .c .m .cc .mm .cxx .cpp .C .o
|
202
|
+
|
203
|
+
.cc.o:
|
204
|
+
$(ECHO) compiling $(<)
|
205
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
|
206
|
+
|
207
|
+
.mm.o:
|
208
|
+
$(ECHO) compiling $(<)
|
209
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
|
210
|
+
|
211
|
+
.cxx.o:
|
212
|
+
$(ECHO) compiling $(<)
|
213
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
|
214
|
+
|
215
|
+
.cpp.o:
|
216
|
+
$(ECHO) compiling $(<)
|
217
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
|
218
|
+
|
219
|
+
.C.o:
|
220
|
+
$(ECHO) compiling $(<)
|
221
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
|
222
|
+
|
223
|
+
.c.o:
|
224
|
+
$(ECHO) compiling $(<)
|
225
|
+
$(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $<
|
226
|
+
|
227
|
+
.m.o:
|
228
|
+
$(ECHO) compiling $(<)
|
229
|
+
$(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $<
|
230
|
+
|
231
|
+
$(DLLIB): $(OBJS) Makefile
|
232
|
+
$(ECHO) linking shared-object $(DLLIB)
|
233
|
+
-$(Q)$(RM) $(@)
|
234
|
+
$(Q) $(LDSHAREDXX) -o $@ $(OBJS) $(LIBPATH) $(DLDFLAGS) $(LOCAL_LIBS) $(LIBS)
|
235
|
+
$(Q) $(POSTLINK)
|
236
|
+
|
237
|
+
|
238
|
+
|
239
|
+
$(OBJS): $(HDRS) $(ruby_headers)
|
data/ext/extconf.rb
ADDED
data/ext/wordtree.cc
ADDED
@@ -0,0 +1,125 @@
|
|
1
|
+
#include <ruby.h>
|
2
|
+
#include <ruby/encoding.h>
|
3
|
+
#include <stdio.h>
|
4
|
+
#include <string.h>
|
5
|
+
#include <assert.h>
|
6
|
+
|
7
|
+
// for rubinius
|
8
|
+
#ifndef rb_enc_fast_mbclen
|
9
|
+
# define rb_enc_fast_mbclen rb_enc_mbclen
|
10
|
+
#endif
|
11
|
+
|
12
|
+
static rb_encoding* u8_enc;
|
13
|
+
static rb_encoding* bin_enc;
|
14
|
+
|
15
|
+
/** Cleans a NUL-terminated buffer in place. Transforms text such as:
 *
 *   And behold, I said, "This is no good!"
 *   What shall ye say unto these people, there-
 *   fore?
 *
 * into a single line of text, like the following:
 *
 *   and behold i said this is no good.what shall ye say unto these people therefore.
 *
 * Spaces indicate word boundaries, while periods indicate sentence boundaries.
 *
 * Rules applied byte by byte:
 *   - 'A'..'Z' are lowercased; 'a'..'z' are kept.
 *   - '\n' is treated as a space; runs of spaces collapse to one, and spaces
 *     at the start, at the end, or directly after a period are dropped.
 *   - '?' and '!' are treated as '.'; runs of periods collapse to one and a
 *     space directly before a period is removed.
 *   - '-' is dropped and swallows the whitespace that follows it, so a word
 *     hyphenated across a line break is joined back together.
 *   - every other byte is dropped.
 *
 * @param text  writable, NUL-terminated buffer; may be NULL or empty.
 * @return      the new length in bytes. The buffer is NUL-terminated at that
 *              length (the output is never longer than the input).
 */
size_t text_clean_cstr(char* text)
{
  if (text == NULL || *text == '\0') return 0;

  char* write = text;
  bool join_lines = false;
  bool just_added_space = true;   // starts true to prevent prefix spaces
  bool just_added_period = false;

  for (char* read = text; *read; read++) {
    char c = *read;
    if (c >= 'A' && c <= 'Z') {
      // Change upper case to lowercase
      c += 32;
    } else if (c == '\n') {
      // Change newlines to spaces (i.e. both count as whitespace)
      c = ' ';
    } else if (c == '?' || c == '!') {
      // Change exclamation, question marks to periods (i.e. sentence boundaries)
      c = '.';
    }

    if (c == '-') {
      join_lines = true;
    } else if (join_lines && c == ' ') {
      // ignore whitespace after a dash (i.e. including newlines, which is the
      // most common case because words that are broken by syllables are dashed)
    } else if (c == '.' && !just_added_period) {
      // Erase the space before the period. just_added_space is also true
      // before anything has been written, so only step back when a space
      // was really emitted; otherwise we would write before the buffer.
      if (just_added_space && write > text) write--;
      *write++ = '.';
      just_added_period = true;
      just_added_space = false;
      join_lines = false;
    } else if (c == ' ' && !just_added_space && !just_added_period) {
      *write++ = ' ';
      just_added_space = true;
      just_added_period = false;
    } else if (c >= 'a' && c <= 'z') {
      *write++ = c;
      just_added_space = false;
      just_added_period = false;
      join_lines = false;
    }
  }

  // Erase a space at the end of the text (but never step before the buffer
  // when nothing at all was written).
  if (just_added_space && write > text) write--;

  // write never overtakes read, so this stays inside the original buffer.
  *write = '\0';

  // Return the new length of the string
  return (size_t)(write - text);
}
|
78
|
+
|
79
|
+
/**
 * WordTree::Text.common_trigrams(text) -> Integer
 *
 * Counts the byte positions in `text` at which one of the common English
 * trigrams listed below begins. Matches may overlap, and each position is
 * counted at most once. The comparison is byte-wise and case-sensitive
 * (the table is lowercase).
 *
 * Returns 0 for strings shorter than three bytes. Raises TypeError (via
 * StringValue) if `text` cannot be converted to a String.
 */
static VALUE text_common_trigrams(VALUE self, VALUE text) {
  /* 28 most common English trigrams, all squished together */
  static const char common_trigrams[] = "allandedtentereforhashatherhisingionithmenncendeoftsthterthathethitiotisverwaswityou";
  /* sizeof includes the trailing NUL, which is not part of any trigram */
  const size_t table_len = sizeof(common_trigrams) - 1;

  StringValue(text);
  const char* ptext = RSTRING_PTR(text);
  long len = RSTRING_LEN(text);

  if (len < 3) return INT2NUM(0);

  /* Last position at which a full three-byte window still fits in the text */
  const char* last_start = ptext + (len - 3);
  long common_matched = 0;

  for (const char* ptr = ptext; ptr <= last_start; ptr++) {
    for (size_t i = 0; i < table_len; i += 3) {
      if (memcmp(ptr, common_trigrams + i, 3) == 0) {
        common_matched++;
        break;
      }
    }
  }

  return LONG2NUM(common_matched);
}
|
103
|
+
|
104
|
+
/**
 * WordTree::Text.clean(text) -> String
 *
 * Cleans `text` IN PLACE using text_clean_cstr and returns it, truncated to
 * the cleaned length.
 *
 * Raises TypeError if `text` is not (convertible to) a String, an error from
 * rb_str_modify if the string cannot be modified (e.g. frozen), and
 * ArgumentError from StringValueCStr if it contains a NUL byte.
 */
static VALUE text_clean(VALUE self, VALUE text) {
  /* Verify the argument is a String before touching its internals */
  StringValue(text);

  /* Make the buffer writable and private to this string */
  rb_str_modify(text);

  char* ctext = StringValueCStr(text);
  size_t new_length = text_clean_cstr(ctext);

  rb_str_set_len(text, (long)new_length);

  return text;
}
|
114
|
+
|
115
|
+
extern "C"
|
116
|
+
void Init_wordtree() {
|
117
|
+
VALUE rb_mWordTree = rb_define_module("WordTree");
|
118
|
+
VALUE rb_mText = rb_define_module_under(rb_mWordTree, "Text");
|
119
|
+
|
120
|
+
u8_enc = rb_utf8_encoding();
|
121
|
+
bin_enc = rb_ascii8bit_encoding();
|
122
|
+
|
123
|
+
rb_define_module_function(rb_mText, "clean", RUBY_METHOD_FUNC(text_clean), 1);
|
124
|
+
rb_define_module_function(rb_mText, "common_trigrams", RUBY_METHOD_FUNC(text_common_trigrams), 1);
|
125
|
+
}
|
data/lib/wordtree/archdown.rb
CHANGED
@@ -23,10 +23,8 @@ module WordTree
|
|
23
23
|
|
24
24
|
def content_for(archivist_book)
|
25
25
|
[archivist_book.download, nil]
|
26
|
-
rescue Archivist::Model::Document::UnsupportedFormat => e
|
27
|
-
[nil, e
|
28
|
-
rescue StandardError => e
|
29
|
-
[nil, e.to_s]
|
26
|
+
rescue StandardError, Archivist::Model::Document::UnsupportedFormat => e
|
27
|
+
[nil, e]
|
30
28
|
end
|
31
29
|
|
32
30
|
def download_all(search_terms, &each_book)
|
data/lib/wordtree/book.rb
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
require 'virtus'
|
2
2
|
require 'simhash'
|
3
|
+
require 'set'
|
3
4
|
|
4
|
-
require 'wordtree/
|
5
|
+
require 'wordtree/text'
|
5
6
|
|
6
7
|
module WordTree
|
7
8
|
class Book
|
@@ -18,12 +19,12 @@ module WordTree
|
|
18
19
|
attribute :size_bytes, Integer, :default => :content_size
|
19
20
|
# A simhash (locality-sensitive hash) of the content
|
20
21
|
attribute :simhash, Integer
|
22
|
+
attribute :ngrams_counted, Set
|
21
23
|
|
22
24
|
attribute :content, String
|
23
25
|
|
24
26
|
def initialize(*args)
|
25
27
|
super
|
26
|
-
@ngrams = {}
|
27
28
|
end
|
28
29
|
|
29
30
|
def self.create(id, metadata, content)
|
@@ -38,44 +39,17 @@ module WordTree
|
|
38
39
|
attributes.select{ |k,v| !v.nil? && k != :content }
|
39
40
|
end
|
40
41
|
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
@content_clean_wrap = wrap
|
45
|
-
@content_clean = TextUtils.clean_text(content, wrap)
|
46
|
-
end
|
47
|
-
@content_clean
|
42
|
+
# Modify and clean content in-place (slightly faster)
|
43
|
+
def content_clean!
|
44
|
+
WordTree::Text.clean(content)
|
48
45
|
end
|
49
46
|
|
50
|
-
def
|
51
|
-
|
52
|
-
end
|
53
|
-
|
54
|
-
def each_ngram(n=1, &block)
|
55
|
-
TextUtils.each_ngram(content_clean, n, &block)
|
56
|
-
end
|
57
|
-
|
58
|
-
def set_ngrams(n, lookup)
|
59
|
-
raise ArgumentError, "must be a Hash" unless lookup.is_a?(Hash)
|
60
|
-
@ngrams[n] = lookup
|
47
|
+
def content_clean
|
48
|
+
@content_clean ||= WordTree::Text.clean(content.dup)
|
61
49
|
end
|
62
50
|
|
63
|
-
def
|
64
|
-
|
65
|
-
@ngrams[n] ||= count_ngrams(n)
|
66
|
-
end
|
67
|
-
|
68
|
-
def all_ngrams
|
69
|
-
@ngrams
|
70
|
-
end
|
71
|
-
|
72
|
-
def count_ngrams(n=1)
|
73
|
-
{}.tap do |tally|
|
74
|
-
each_ngram(n) do |ngram|
|
75
|
-
tally[ngram] ||= 0
|
76
|
-
tally[ngram] += 1
|
77
|
-
end
|
78
|
-
end
|
51
|
+
def content_size
|
52
|
+
content ? content.size : nil
|
79
53
|
end
|
80
54
|
|
81
55
|
def calculate_simhash
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module WordTree
  # An Enumerable list of books (or paths to books) built from one of several
  # kinds of source.
  class BookList
    include Enumerable

    # Can be initialized from the following sources:
    # - a WordTree::Disk::Library object
    # - an open File object (containing a list of files or paths to books);
    #   the file is read completely and then closed
    # - a String directory (presumed to be the library on disk)
    # - a String file (containing a list of files or paths to books)
    #
    # Raises Errno::ENOENT when a String source does not exist on disk, and
    # ArgumentError when the source is of an unsupported type.
    def initialize(source)
      @source = source
      @iterable = iterable_from_source(source)
    end

    # Returns the object that #each delegates to: the library itself, or an
    # Array with one entry per line of the list file.
    def iterable_from_source(source)
      case source
      when WordTree::Disk::Library
        source
      when File
        begin
          source.read.split("\n")
        ensure
          # Close the File itself (not the Array of lines read from it)
          source.close
        end
      when String
        if File.directory?(source)
          WordTree::Disk::Library.new(source)
        elsif File.exist?(source)
          File.read(source).split("\n")
        else
          raise Errno::ENOENT, "Unable to find source for BookList, #{source.inspect}"
        end
      else
        raise ArgumentError, "Unsupported source for BookList, #{source.inspect}"
      end
    end

    # Yields each entry of the underlying source. Returns an Enumerator when
    # called without a block.
    def each(&block)
      return enum_for(:each) unless block_given?

      @iterable.each(&block)
    end
  end
end
|
@@ -10,6 +10,8 @@ module WordTree
|
|
10
10
|
|
11
11
|
attr_reader :library
|
12
12
|
|
13
|
+
MissingContent = Class.new(StandardError)
|
14
|
+
|
13
15
|
# @library can be either a string (the path of the library) or a
|
14
16
|
# WordTree::Disk::Library object
|
15
17
|
def initialize(library)
|
@@ -20,7 +22,7 @@ module WordTree
|
|
20
22
|
end
|
21
23
|
end
|
22
24
|
|
23
|
-
def
|
25
|
+
def find(book_id)
|
24
26
|
begin
|
25
27
|
retrieved = Preamble.load(library.path_to(book_id), :external_encoding => "utf-8")
|
26
28
|
Book.create(book_id, retrieved.metadata, retrieved.content)
|
@@ -29,45 +31,17 @@ module WordTree
|
|
29
31
|
end
|
30
32
|
end
|
31
33
|
|
32
|
-
def find(book_id)
|
33
|
-
find_without_ngrams(book_id).tap do |book|
|
34
|
-
(1..9).each do |n|
|
35
|
-
path = library.path_to(book_id, :ngrams, :n => n)
|
36
|
-
if File.exist?(path)
|
37
|
-
File.open(path) do |f|
|
38
|
-
hash = JSON.load(f)
|
39
|
-
book.set_ngrams(n, hash)
|
40
|
-
end
|
41
|
-
end
|
42
|
-
end
|
43
|
-
end
|
44
|
-
end
|
45
|
-
|
46
34
|
def each(file_suffix_re=/\.(md|txt)$/, &block)
|
47
|
-
library.
|
35
|
+
library.each_with_id(file_suffix_re) do |path, id|
|
48
36
|
retrieved = Preamble.load(path, :external_encoding => "utf-8")
|
49
37
|
yield Book.new(retrieved.metadata.merge("content" => retrieved.content))
|
50
38
|
end
|
51
39
|
end
|
52
40
|
|
53
|
-
def save_without_ngrams(book)
|
54
|
-
library.mkdir(book.id)
|
55
|
-
Preamble.new(book.metadata, book.content || "").save(library.path_to(book.id))
|
56
|
-
end
|
57
|
-
|
58
|
-
def save_ngrams(book)
|
59
|
-
book.all_ngrams.each_pair do |n, hash|
|
60
|
-
path = library.path_to(book.id, :ngrams, :n => n)
|
61
|
-
File.open(path, "w") do |file|
|
62
|
-
file.write hash.to_json
|
63
|
-
end
|
64
|
-
end
|
65
|
-
end
|
66
|
-
|
67
41
|
def save(book)
|
68
|
-
|
69
|
-
|
70
|
-
|
42
|
+
library.mkdir(book.id)
|
43
|
+
raise MissingContent, "book #{book.id} is missing content" unless book.content
|
44
|
+
Preamble.new(book.metadata, book.content).save(library.path_to(book.id))
|
71
45
|
end
|
72
46
|
|
73
47
|
def archive_org_get(*book_ids, &block)
|
@@ -92,6 +66,7 @@ module WordTree
|
|
92
66
|
if failure
|
93
67
|
#TODO: logging
|
94
68
|
$stderr.puts "Unable to download from archive.org: #{failure}"
|
69
|
+
raise failure
|
95
70
|
else
|
96
71
|
book = Book.create(metadata["archive_org_id"], metadata, content)
|
97
72
|
save(book)
|
@@ -45,7 +45,7 @@ module WordTree
|
|
45
45
|
end
|
46
46
|
|
47
47
|
# Breadth-first search of the directory structure, operating on each book
|
48
|
-
def
|
48
|
+
def each_with_id(file_suffix_re=/\.(md|txt)$/, &block)
|
49
49
|
Find.find(@root) do |path|
|
50
50
|
if FileTest.directory?(path)
|
51
51
|
if File.basename(path)[0] == ?.
|
@@ -60,6 +60,10 @@ module WordTree
|
|
60
60
|
end
|
61
61
|
end
|
62
62
|
|
63
|
+
def each(&block)
|
64
|
+
each_with_id{ |path, id| yield path }
|
65
|
+
end
|
66
|
+
|
63
67
|
end
|
64
68
|
end
|
65
69
|
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'strscan'
|
2
|
+
require_relative "../../ext/wordtree"
|
3
|
+
|
4
|
+
module WordTree
  module Text
    # Split +text+ into two parts at the last space found at or before
    # +split_index+. The space itself is dropped.
    #
    # Returns [text, ""] when +split_index+ is at or beyond the end of the
    # text. When there is no space at or before +split_index+, the text is
    # cut exactly at +split_index+ and no character is dropped.
    def self.split_near(text, split_index)
      return [text, ""] if split_index >= text.size

      space_at = text.rindex(' ', split_index)
      if space_at
        [text[0...space_at], text[(space_at + 1)..-1]]
      else
        [text[0...split_index], text[split_index..-1]]
      end
    end

    # Wrap +input+ so that lines are broken at a space at or before +wrap+
    # characters where possible (see split_near). Every output line,
    # including the last, is terminated by "\n". +input+ is not modified.
    #
    # Raises ArgumentError unless +wrap+ is positive, since no progress can
    # be made otherwise.
    def self.word_wrap(input, wrap=120)
      raise ArgumentError, "wrap must be positive" unless wrap > 0

      wrapped_output = String.new
      remainder = input
      loop do
        # Always split what is left over, not the original input
        output_line, remainder = split_near(remainder, wrap)
        wrapped_output << output_line << "\n"
        break unless remainder.size > wrap
      end
      wrapped_output << remainder << "\n" unless remainder.empty?

      wrapped_output
    end

  end
end
|
data/lib/wordtree/version.rb
CHANGED
@@ -1,3 +1,3 @@
|
|
1
|
-
module
|
2
|
-
VERSION = "0.
|
1
|
+
module WordTree
|
2
|
+
VERSION = "0.4.0"
|
3
3
|
end
|