wordtree 0.3.1 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/ext/Makefile +239 -0
- data/ext/extconf.rb +5 -0
- data/ext/wordtree.cc +125 -0
- data/lib/wordtree/archdown.rb +2 -4
- data/lib/wordtree/book.rb +10 -36
- data/lib/wordtree/book_list.rb +38 -0
- data/lib/wordtree/disk/librarian.rb +8 -33
- data/lib/wordtree/disk/library.rb +5 -1
- data/lib/wordtree/ngrams.rb +12 -0
- data/lib/wordtree/text.rb +37 -0
- data/lib/wordtree/version.rb +2 -2
- data/spec/wordtree/book_spec.rb +26 -44
- data/spec/wordtree/disk/librarian_spec.rb +0 -44
- data/spec/wordtree/text_spec.rb +81 -0
- data/wordtree.gemspec +17 -15
- metadata +68 -93
- data/lib/wordtree/text_utils.rb +0 -107
- data/spec/wordtree/text_utils_spec.rb +0 -89
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 7bcb5a59a130a24ca1bede26a9f3b8efa651230c
|
4
|
+
data.tar.gz: 03ef94ffd836a11f891065fd7eb3c73d4aa8bdd2
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: cbf2b847f90fdffd1a52a4f7ea2c2fd78ce339ded803cbf30720fba1dced77b2ea2e9b1b84f5fee147d472d0e673437ad0e58ccffcb1a5fd2794fa87ec739384
|
7
|
+
data.tar.gz: 0abb233dd846913d69e13087d84c174136e8a1511c417f5239e3395972f17af71f43091079e8fc605354843c09bb73b6e82ca358801303d77f692c3d8d140ea5
|
data/ext/Makefile
ADDED
@@ -0,0 +1,239 @@
|
|
1
|
+
|
2
|
+
SHELL = /bin/sh
|
3
|
+
|
4
|
+
# V=0 quiet, V=1 verbose. other values don't work.
|
5
|
+
V = 0
|
6
|
+
Q1 = $(V:1=)
|
7
|
+
Q = $(Q1:0=@)
|
8
|
+
ECHO1 = $(V:1=@:)
|
9
|
+
ECHO = $(ECHO1:0=@echo)
|
10
|
+
|
11
|
+
#### Start of system configuration section. ####
|
12
|
+
|
13
|
+
srcdir = .
|
14
|
+
topdir = /Users/duane/.rbenv/versions/2.1.3/include/ruby-2.1.0
|
15
|
+
hdrdir = $(topdir)
|
16
|
+
arch_hdrdir = /Users/duane/.rbenv/versions/2.1.3/include/ruby-2.1.0/x86_64-darwin13.0
|
17
|
+
PATH_SEPARATOR = :
|
18
|
+
VPATH = $(srcdir):$(arch_hdrdir)/ruby:$(hdrdir)/ruby
|
19
|
+
prefix = $(DESTDIR)/Users/duane/.rbenv/versions/2.1.3
|
20
|
+
rubysitearchprefix = $(rubylibprefix)/$(sitearch)
|
21
|
+
rubyarchprefix = $(rubylibprefix)/$(arch)
|
22
|
+
rubylibprefix = $(libdir)/$(RUBY_BASE_NAME)
|
23
|
+
exec_prefix = $(prefix)
|
24
|
+
vendorarchhdrdir = $(vendorhdrdir)/$(sitearch)
|
25
|
+
sitearchhdrdir = $(sitehdrdir)/$(sitearch)
|
26
|
+
rubyarchhdrdir = $(rubyhdrdir)/$(arch)
|
27
|
+
vendorhdrdir = $(rubyhdrdir)/vendor_ruby
|
28
|
+
sitehdrdir = $(rubyhdrdir)/site_ruby
|
29
|
+
rubyhdrdir = $(includedir)/$(RUBY_VERSION_NAME)
|
30
|
+
vendorarchdir = $(vendorlibdir)/$(sitearch)
|
31
|
+
vendorlibdir = $(vendordir)/$(ruby_version)
|
32
|
+
vendordir = $(rubylibprefix)/vendor_ruby
|
33
|
+
sitearchdir = $(sitelibdir)/$(sitearch)
|
34
|
+
sitelibdir = $(sitedir)/$(ruby_version)
|
35
|
+
sitedir = $(rubylibprefix)/site_ruby
|
36
|
+
rubyarchdir = $(rubylibdir)/$(arch)
|
37
|
+
rubylibdir = $(rubylibprefix)/$(ruby_version)
|
38
|
+
sitearchincludedir = $(includedir)/$(sitearch)
|
39
|
+
archincludedir = $(includedir)/$(arch)
|
40
|
+
sitearchlibdir = $(libdir)/$(sitearch)
|
41
|
+
archlibdir = $(libdir)/$(arch)
|
42
|
+
ridir = $(datarootdir)/$(RI_BASE_NAME)
|
43
|
+
mandir = $(datarootdir)/man
|
44
|
+
localedir = $(datarootdir)/locale
|
45
|
+
libdir = $(exec_prefix)/lib
|
46
|
+
psdir = $(docdir)
|
47
|
+
pdfdir = $(docdir)
|
48
|
+
dvidir = $(docdir)
|
49
|
+
htmldir = $(docdir)
|
50
|
+
infodir = $(datarootdir)/info
|
51
|
+
docdir = $(datarootdir)/doc/$(PACKAGE)
|
52
|
+
oldincludedir = $(DESTDIR)/usr/include
|
53
|
+
includedir = $(prefix)/include
|
54
|
+
localstatedir = $(prefix)/var
|
55
|
+
sharedstatedir = $(prefix)/com
|
56
|
+
sysconfdir = $(prefix)/etc
|
57
|
+
datadir = $(datarootdir)
|
58
|
+
datarootdir = $(prefix)/share
|
59
|
+
libexecdir = $(exec_prefix)/libexec
|
60
|
+
sbindir = $(exec_prefix)/sbin
|
61
|
+
bindir = $(exec_prefix)/bin
|
62
|
+
archdir = $(rubyarchdir)
|
63
|
+
|
64
|
+
|
65
|
+
CC = clang
|
66
|
+
CXX = g++
|
67
|
+
LIBRUBY = $(LIBRUBY_A)
|
68
|
+
LIBRUBY_A = lib$(RUBY_SO_NAME)-static.a
|
69
|
+
LIBRUBYARG_SHARED =
|
70
|
+
LIBRUBYARG_STATIC = -l$(RUBY_SO_NAME)-static -framework CoreFoundation
|
71
|
+
empty =
|
72
|
+
OUTFLAG = -o $(empty)
|
73
|
+
COUTFLAG = -o $(empty)
|
74
|
+
|
75
|
+
RUBY_EXTCONF_H =
|
76
|
+
cflags = $(optflags) $(debugflags) $(warnflags)
|
77
|
+
optflags = -O3 -fno-fast-math
|
78
|
+
debugflags = -ggdb3
|
79
|
+
warnflags = -Wall -Wextra -Wno-unused-parameter -Wno-parentheses -Wno-long-long -Wno-missing-field-initializers -Wunused-variable -Wpointer-arith -Wwrite-strings -Wdeclaration-after-statement -Wshorten-64-to-32 -Wimplicit-function-declaration -Wdivision-by-zero -Wextra-tokens
|
80
|
+
CCDLFLAGS = -fno-common
|
81
|
+
CFLAGS = $(CCDLFLAGS) -O3 -Wno-error=shorten-64-to-32 -pipe $(ARCH_FLAG)
|
82
|
+
INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir)
|
83
|
+
DEFS =
|
84
|
+
CPPFLAGS = -I/Users/duane/.rbenv/versions/2.1.3/include -D_XOPEN_SOURCE -D_DARWIN_C_SOURCE -D_DARWIN_UNLIMITED_SELECT -D_REENTRANT $(DEFS) $(cppflags)
|
85
|
+
CXXFLAGS = $(CCDLFLAGS) $(cxxflags) $(ARCH_FLAG)
|
86
|
+
ldflags = -L. -L/Users/duane/.rbenv/versions/2.1.3/lib -fstack-protector -L/usr/local/lib
|
87
|
+
dldflags = -Wl,-undefined,dynamic_lookup -Wl,-multiply_defined,suppress
|
88
|
+
ARCH_FLAG =
|
89
|
+
DLDFLAGS = $(ldflags) $(dldflags) $(ARCH_FLAG)
|
90
|
+
LDSHARED = $(CC) -dynamic -bundle
|
91
|
+
LDSHAREDXX = $(CXX) -dynamic -bundle
|
92
|
+
AR = ar
|
93
|
+
EXEEXT =
|
94
|
+
|
95
|
+
RUBY_INSTALL_NAME = ruby
|
96
|
+
RUBY_SO_NAME = ruby
|
97
|
+
RUBYW_INSTALL_NAME =
|
98
|
+
RUBY_VERSION_NAME = $(RUBY_BASE_NAME)-$(ruby_version)
|
99
|
+
RUBYW_BASE_NAME = rubyw
|
100
|
+
RUBY_BASE_NAME = ruby
|
101
|
+
|
102
|
+
arch = x86_64-darwin13.0
|
103
|
+
sitearch = $(arch)
|
104
|
+
ruby_version = 2.1.0
|
105
|
+
ruby = $(bindir)/ruby
|
106
|
+
RUBY = $(ruby)
|
107
|
+
ruby_headers = $(hdrdir)/ruby.h $(hdrdir)/ruby/ruby.h $(hdrdir)/ruby/defines.h $(hdrdir)/ruby/missing.h $(hdrdir)/ruby/intern.h $(hdrdir)/ruby/st.h $(hdrdir)/ruby/subst.h $(arch_hdrdir)/ruby/config.h
|
108
|
+
|
109
|
+
RM = rm -f
|
110
|
+
RM_RF = $(RUBY) -run -e rm -- -rf
|
111
|
+
RMDIRS = rmdir -p
|
112
|
+
MAKEDIRS = mkdir -p
|
113
|
+
INSTALL = /usr/bin/install -c
|
114
|
+
INSTALL_PROG = $(INSTALL) -m 0755
|
115
|
+
INSTALL_DATA = $(INSTALL) -m 644
|
116
|
+
COPY = cp
|
117
|
+
TOUCH = exit >
|
118
|
+
|
119
|
+
#### End of system configuration section. ####
|
120
|
+
|
121
|
+
preload =
|
122
|
+
|
123
|
+
libpath = . $(libdir)
|
124
|
+
LIBPATH = -L. -L$(libdir)
|
125
|
+
DEFFILE =
|
126
|
+
|
127
|
+
CLEANFILES = mkmf.log
|
128
|
+
DISTCLEANFILES =
|
129
|
+
DISTCLEANDIRS =
|
130
|
+
|
131
|
+
extout =
|
132
|
+
extout_prefix =
|
133
|
+
target_prefix =
|
134
|
+
LOCAL_LIBS =
|
135
|
+
LIBS = -lpthread -lgmp -ldl -lobjc
|
136
|
+
ORIG_SRCS = wordtree.cc
|
137
|
+
SRCS = $(ORIG_SRCS)
|
138
|
+
OBJS = wordtree.o
|
139
|
+
HDRS =
|
140
|
+
TARGET = wordtree
|
141
|
+
TARGET_NAME = wordtree
|
142
|
+
TARGET_ENTRY = Init_$(TARGET_NAME)
|
143
|
+
DLLIB = $(TARGET).bundle
|
144
|
+
EXTSTATIC =
|
145
|
+
STATIC_LIB =
|
146
|
+
|
147
|
+
TIMESTAMP_DIR = .
|
148
|
+
BINDIR = $(bindir)
|
149
|
+
RUBYCOMMONDIR = $(sitedir)$(target_prefix)
|
150
|
+
RUBYLIBDIR = $(sitelibdir)$(target_prefix)
|
151
|
+
RUBYARCHDIR = $(sitearchdir)$(target_prefix)
|
152
|
+
HDRDIR = $(rubyhdrdir)/ruby$(target_prefix)
|
153
|
+
ARCHHDRDIR = $(rubyhdrdir)/$(arch)/ruby$(target_prefix)
|
154
|
+
|
155
|
+
TARGET_SO = $(DLLIB)
|
156
|
+
CLEANLIBS = $(TARGET).bundle
|
157
|
+
CLEANOBJS = *.o *.bak
|
158
|
+
|
159
|
+
all: $(DLLIB)
|
160
|
+
static: $(STATIC_LIB)
|
161
|
+
.PHONY: all install static install-so install-rb
|
162
|
+
.PHONY: clean clean-so clean-static clean-rb
|
163
|
+
|
164
|
+
clean-static::
|
165
|
+
clean-rb-default::
|
166
|
+
clean-rb::
|
167
|
+
clean-so::
|
168
|
+
clean: clean-so clean-static clean-rb-default clean-rb
|
169
|
+
-$(Q)$(RM) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES) .*.time
|
170
|
+
|
171
|
+
distclean-rb-default::
|
172
|
+
distclean-rb::
|
173
|
+
distclean-so::
|
174
|
+
distclean-static::
|
175
|
+
distclean: clean distclean-so distclean-static distclean-rb-default distclean-rb
|
176
|
+
-$(Q)$(RM) Makefile $(RUBY_EXTCONF_H) conftest.* mkmf.log
|
177
|
+
-$(Q)$(RM) core ruby$(EXEEXT) *~ $(DISTCLEANFILES)
|
178
|
+
-$(Q)$(RMDIRS) $(DISTCLEANDIRS) 2> /dev/null || true
|
179
|
+
|
180
|
+
realclean: distclean
|
181
|
+
install: install-so install-rb
|
182
|
+
|
183
|
+
install-so: $(DLLIB) $(TIMESTAMP_DIR)/.RUBYARCHDIR.time
|
184
|
+
$(INSTALL_PROG) $(DLLIB) $(RUBYARCHDIR)
|
185
|
+
clean-static::
|
186
|
+
-$(Q)$(RM) $(STATIC_LIB)
|
187
|
+
install-rb: pre-install-rb install-rb-default
|
188
|
+
install-rb-default: pre-install-rb-default
|
189
|
+
pre-install-rb: Makefile
|
190
|
+
pre-install-rb-default: Makefile
|
191
|
+
pre-install-rb-default:
|
192
|
+
$(ECHO) installing default wordtree libraries
|
193
|
+
$(TIMESTAMP_DIR)/.RUBYARCHDIR.time:
|
194
|
+
$(Q) $(MAKEDIRS) $(@D) $(RUBYARCHDIR)
|
195
|
+
$(Q) $(TOUCH) $@
|
196
|
+
|
197
|
+
site-install: site-install-so site-install-rb
|
198
|
+
site-install-so: install-so
|
199
|
+
site-install-rb: install-rb
|
200
|
+
|
201
|
+
.SUFFIXES: .c .m .cc .mm .cxx .cpp .C .o
|
202
|
+
|
203
|
+
.cc.o:
|
204
|
+
$(ECHO) compiling $(<)
|
205
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
|
206
|
+
|
207
|
+
.mm.o:
|
208
|
+
$(ECHO) compiling $(<)
|
209
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
|
210
|
+
|
211
|
+
.cxx.o:
|
212
|
+
$(ECHO) compiling $(<)
|
213
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
|
214
|
+
|
215
|
+
.cpp.o:
|
216
|
+
$(ECHO) compiling $(<)
|
217
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
|
218
|
+
|
219
|
+
.C.o:
|
220
|
+
$(ECHO) compiling $(<)
|
221
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
|
222
|
+
|
223
|
+
.c.o:
|
224
|
+
$(ECHO) compiling $(<)
|
225
|
+
$(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $<
|
226
|
+
|
227
|
+
.m.o:
|
228
|
+
$(ECHO) compiling $(<)
|
229
|
+
$(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $<
|
230
|
+
|
231
|
+
$(DLLIB): $(OBJS) Makefile
|
232
|
+
$(ECHO) linking shared-object $(DLLIB)
|
233
|
+
-$(Q)$(RM) $(@)
|
234
|
+
$(Q) $(LDSHAREDXX) -o $@ $(OBJS) $(LIBPATH) $(DLDFLAGS) $(LOCAL_LIBS) $(LIBS)
|
235
|
+
$(Q) $(POSTLINK)
|
236
|
+
|
237
|
+
|
238
|
+
|
239
|
+
$(OBJS): $(HDRS) $(ruby_headers)
|
data/ext/extconf.rb
ADDED
data/ext/wordtree.cc
ADDED
@@ -0,0 +1,125 @@
|
|
1
|
+
#include <ruby.h>
|
2
|
+
#include <ruby/encoding.h>
|
3
|
+
#include <stdio.h>
|
4
|
+
#include <string.h>
|
5
|
+
#include <assert.h>
|
6
|
+
|
7
|
+
// for rubinius
|
8
|
+
#ifndef rb_enc_fast_mbclen
|
9
|
+
# define rb_enc_fast_mbclen rb_enc_mbclen
|
10
|
+
#endif
|
11
|
+
|
12
|
+
static rb_encoding* u8_enc;
|
13
|
+
static rb_encoding* bin_enc;
|
14
|
+
|
15
|
+
/** Cleans a NUL-terminated string in place and returns its new length.
 *
 * Transforms text such as the following:
 *
 *   And behold, I said, "This is no good!"
 *   What shall ye say unto these people, there-
 *   fore?
 *
 * Into a cleaned up single line of text, like the following:
 *
 *   and behold i said this is no good.what shall ye say unto these people therefore.
 *
 * Spaces indicate word boundaries, while periods indicate sentence boundaries.
 *
 * Rules applied to each input byte:
 *   - 'A'..'Z' are lowercased; 'a'..'z' are kept.
 *   - '\n' counts as a space; runs of spaces collapse to one, and spaces
 *     directly after a period or at the start/end of the text are dropped.
 *   - '?' and '!' count as '.'; runs of periods collapse to one, and a
 *     space directly before a period is removed.
 *   - '-' is dropped, and any whitespace following it is skipped so that
 *     words hyphenated across a line break are rejoined.
 *   - every other byte is dropped.
 *
 * The output is never longer than the input, so it is written over the
 * input buffer. The buffer is NOT re-terminated: only the first N bytes
 * (N being the return value) are meaningful, and the caller is expected to
 * truncate the string to that length.
 *
 * @param text  writable, NUL-terminated buffer (must not be NULL)
 * @return      number of bytes of cleaned text now at the start of `text`
 */
size_t text_clean_cstr(char* text)
{
  if (*text == '\0') return 0;

  char* write = text;
  bool join_lines = false;
  // Starts out true as a sentinel so that leading whitespace is skipped.
  // While it is still the sentinel nothing has been written yet, which is
  // why every "erase the last space" step below also checks write > text.
  bool just_added_space = true;
  bool just_added_period = false;

  for (const char* read = text; *read; read++) {
    char c = *read;
    if (c >= 'A' && c <= 'Z') {
      // Change upper case to lowercase
      c = (char)(c - 'A' + 'a');
    } else if (c == '\n') {
      // Change newlines to spaces (i.e. both count as whitespace)
      c = ' ';
    } else if (c == '?' || c == '!') {
      // Change exclamation, question marks to periods (i.e. sentence boundaries)
      c = '.';
    }

    if (c == '-') {
      join_lines = true;
    } else if (join_lines && c == ' ') {
      // ignore whitespace after a dash (i.e. including newlines, which is the
      // most common case because words that are broken by syllables are dashed)
    } else if (c == '.' && !just_added_period) {
      // erase space before period -- but only if a space was really written;
      // otherwise we would step in front of the buffer
      if (just_added_space && write > text) write--;
      *write++ = '.';
      just_added_period = true;
      just_added_space = false;
      join_lines = false;
    } else if (c == ' ' && !just_added_space && !just_added_period) {
      *write++ = ' ';
      just_added_space = true;
      just_added_period = false;
    } else if (c >= 'a' && c <= 'z') {
      *write++ = c;
      just_added_space = false;
      just_added_period = false;
      join_lines = false;
    }
  }

  // erase space at end of text (again, only if one was actually written, so
  // input without any letters yields length 0 instead of wrapping around)
  if (just_added_space && write > text) write--;

  // Return the new length of the string
  return (size_t)(write - text);
}
|
78
|
+
|
79
|
+
/* WordTree::Text.common_trigrams(text) -> Integer
 *
 * Counts the positions in `text` at which one of a fixed set of common
 * English trigrams begins. Comparison is byte-wise and case-sensitive, and
 * overlapping occurrences are each counted. Strings shorter than three
 * bytes yield 0.
 *
 * Raises TypeError (via StringValue) when `text` cannot be converted to a
 * String.
 */
static VALUE text_common_trigrams(VALUE self, VALUE text) {
  /* 28 most common English trigrams, all squished together */
  static const char common_trigrams[] = "allandedtentereforhashatherhisingionithmenncendeoftsthterthathethitiotisverwaswityou";
  /* sizeof() includes the terminating NUL; exclude it so the scan below
   * never compares against (or reads past) the end of the table. */
  const size_t table_len = sizeof(common_trigrams) - 1;

  /* Make sure we really have a String before touching its buffer. */
  StringValue(text);
  const char* ptr = RSTRING_PTR(text);
  const long len = RSTRING_LEN(text);

  if (len < 3) return INT2NUM(0);

  /* Last position at which a full 3-byte window still fits in the string. */
  const char* last_start = ptr + (len - 3);
  long common_matched = 0;

  for (; ptr <= last_start; ptr++) {
    for (size_t i = 0; i + 3 <= table_len; i += 3) {
      if (memcmp(ptr, common_trigrams + i, 3) == 0) {
        common_matched++;
        break;
      }
    }
  }

  return LONG2NUM(common_matched);
}
|
103
|
+
|
104
|
+
/* WordTree::Text.clean(text) -> text
 *
 * Cleans `text` IN PLACE using text_clean_cstr() and returns the same
 * (now shorter or equal-length) String. Callers that need to keep the
 * original must pass a copy.
 *
 * Raises TypeError when `text` is not convertible to a String, a frozen
 * error when it is frozen, and ArgumentError (via StringValueCStr) when it
 * contains an embedded NUL byte.
 */
static VALUE text_clean(VALUE self, VALUE text) {
  /* Validate the argument before asking Ruby to make its buffer writable. */
  StringValue(text);
  rb_str_modify(text);

  char* ctext = StringValueCStr(text);
  const long old_length = RSTRING_LEN(text);
  size_t new_length = text_clean_cstr(ctext);

  /* Cleaning only ever removes bytes; never hand rb_str_set_len a length
   * beyond the buffer we actually own. */
  if (new_length > (size_t)old_length) new_length = 0;

  rb_str_set_len(text, (long)new_length);

  return text;
}
|
114
|
+
|
115
|
+
extern "C"
|
116
|
+
void Init_wordtree() {
|
117
|
+
VALUE rb_mWordTree = rb_define_module("WordTree");
|
118
|
+
VALUE rb_mText = rb_define_module_under(rb_mWordTree, "Text");
|
119
|
+
|
120
|
+
u8_enc = rb_utf8_encoding();
|
121
|
+
bin_enc = rb_ascii8bit_encoding();
|
122
|
+
|
123
|
+
rb_define_module_function(rb_mText, "clean", RUBY_METHOD_FUNC(text_clean), 1);
|
124
|
+
rb_define_module_function(rb_mText, "common_trigrams", RUBY_METHOD_FUNC(text_common_trigrams), 1);
|
125
|
+
}
|
data/lib/wordtree/archdown.rb
CHANGED
@@ -23,10 +23,8 @@ module WordTree
|
|
23
23
|
|
24
24
|
def content_for(archivist_book)
|
25
25
|
[archivist_book.download, nil]
|
26
|
-
rescue Archivist::Model::Document::UnsupportedFormat => e
|
27
|
-
[nil, e
|
28
|
-
rescue StandardError => e
|
29
|
-
[nil, e.to_s]
|
26
|
+
rescue StandardError, Archivist::Model::Document::UnsupportedFormat => e
|
27
|
+
[nil, e]
|
30
28
|
end
|
31
29
|
|
32
30
|
def download_all(search_terms, &each_book)
|
data/lib/wordtree/book.rb
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
require 'virtus'
|
2
2
|
require 'simhash'
|
3
|
+
require 'set'
|
3
4
|
|
4
|
-
require 'wordtree/
|
5
|
+
require 'wordtree/text'
|
5
6
|
|
6
7
|
module WordTree
|
7
8
|
class Book
|
@@ -18,12 +19,12 @@ module WordTree
|
|
18
19
|
attribute :size_bytes, Integer, :default => :content_size
|
19
20
|
# A simhash (locality-sensitive hash) of the content
|
20
21
|
attribute :simhash, Integer
|
22
|
+
attribute :ngrams_counted, Set
|
21
23
|
|
22
24
|
attribute :content, String
|
23
25
|
|
24
26
|
def initialize(*args)
|
25
27
|
super
|
26
|
-
@ngrams = {}
|
27
28
|
end
|
28
29
|
|
29
30
|
def self.create(id, metadata, content)
|
@@ -38,44 +39,17 @@ module WordTree
|
|
38
39
|
attributes.select{ |k,v| !v.nil? && k != :content }
|
39
40
|
end
|
40
41
|
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
@content_clean_wrap = wrap
|
45
|
-
@content_clean = TextUtils.clean_text(content, wrap)
|
46
|
-
end
|
47
|
-
@content_clean
|
42
|
+
# Modify and clean content in-place (slightly faster)
|
43
|
+
def content_clean!
|
44
|
+
WordTree::Text.clean(content)
|
48
45
|
end
|
49
46
|
|
50
|
-
def
|
51
|
-
|
52
|
-
end
|
53
|
-
|
54
|
-
def each_ngram(n=1, &block)
|
55
|
-
TextUtils.each_ngram(content_clean, n, &block)
|
56
|
-
end
|
57
|
-
|
58
|
-
def set_ngrams(n, lookup)
|
59
|
-
raise ArgumentError, "must be a Hash" unless lookup.is_a?(Hash)
|
60
|
-
@ngrams[n] = lookup
|
47
|
+
def content_clean
|
48
|
+
@content_clean ||= WordTree::Text.clean(content.dup)
|
61
49
|
end
|
62
50
|
|
63
|
-
def
|
64
|
-
|
65
|
-
@ngrams[n] ||= count_ngrams(n)
|
66
|
-
end
|
67
|
-
|
68
|
-
def all_ngrams
|
69
|
-
@ngrams
|
70
|
-
end
|
71
|
-
|
72
|
-
def count_ngrams(n=1)
|
73
|
-
{}.tap do |tally|
|
74
|
-
each_ngram(n) do |ngram|
|
75
|
-
tally[ngram] ||= 0
|
76
|
-
tally[ngram] += 1
|
77
|
-
end
|
78
|
-
end
|
51
|
+
def content_size
|
52
|
+
content ? content.size : nil
|
79
53
|
end
|
80
54
|
|
81
55
|
def calculate_simhash
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module WordTree
  # An Enumerable over a collection of books, built from one of several
  # kinds of source (see #initialize).
  class BookList
    include Enumerable

    # can be initialized from the following sources:
    # - a WordTree::Disk::Library object
    # - an open File object (containing a list of files or paths to books)
    # - a String directory (presumed to be the library on disk)
    # - a String file (containing a list of files or paths to books)
    #
    # @raise [Errno::ENOENT] if a String source names nothing on disk
    # @raise [ArgumentError] if the source is of an unsupported type
    def initialize(source)
      @source = source
      @iterable = iterable_from_source(source)
    end

    # Converts +source+ into the object that #each delegates to: either a
    # WordTree::Disk::Library, or an Array with one entry per line of the
    # list file.
    #
    # An open File source is read to the end and then closed.
    def iterable_from_source(source)
      case source
      when WordTree::Disk::Library
        source
      when File
        begin
          source.read.split("\n")
        ensure
          # Close the File itself (not the Array of lines read from it).
          source.close
        end
      when String
        if File.directory?(source)
          WordTree::Disk::Library.new(source)
        elsif File.exist?(source)
          # File.read rather than IO.read: the latter treats a leading "|"
          # in the name as a command to run.
          File.read(source).split("\n")
        else
          raise Errno::ENOENT, "Unable to find source for BookList, #{source.inspect}"
        end
      else
        # Fail here rather than with a NoMethodError on nil during iteration.
        raise ArgumentError, "Unsupported source for BookList, #{source.inspect}"
      end
    end

    # Yields each entry of the underlying source. Returns an Enumerator when
    # called without a block.
    def each(&block)
      return enum_for(:each) unless block

      @iterable.each(&block)
    end
  end
end
|
@@ -10,6 +10,8 @@ module WordTree
|
|
10
10
|
|
11
11
|
attr_reader :library
|
12
12
|
|
13
|
+
MissingContent = Class.new(StandardError)
|
14
|
+
|
13
15
|
# @library can be either a string (the path of the library) or a
|
14
16
|
# WordTree::Disk::Library object
|
15
17
|
def initialize(library)
|
@@ -20,7 +22,7 @@ module WordTree
|
|
20
22
|
end
|
21
23
|
end
|
22
24
|
|
23
|
-
def
|
25
|
+
def find(book_id)
|
24
26
|
begin
|
25
27
|
retrieved = Preamble.load(library.path_to(book_id), :external_encoding => "utf-8")
|
26
28
|
Book.create(book_id, retrieved.metadata, retrieved.content)
|
@@ -29,45 +31,17 @@ module WordTree
|
|
29
31
|
end
|
30
32
|
end
|
31
33
|
|
32
|
-
def find(book_id)
|
33
|
-
find_without_ngrams(book_id).tap do |book|
|
34
|
-
(1..9).each do |n|
|
35
|
-
path = library.path_to(book_id, :ngrams, :n => n)
|
36
|
-
if File.exist?(path)
|
37
|
-
File.open(path) do |f|
|
38
|
-
hash = JSON.load(f)
|
39
|
-
book.set_ngrams(n, hash)
|
40
|
-
end
|
41
|
-
end
|
42
|
-
end
|
43
|
-
end
|
44
|
-
end
|
45
|
-
|
46
34
|
def each(file_suffix_re=/\.(md|txt)$/, &block)
|
47
|
-
library.
|
35
|
+
library.each_with_id(file_suffix_re) do |path, id|
|
48
36
|
retrieved = Preamble.load(path, :external_encoding => "utf-8")
|
49
37
|
yield Book.new(retrieved.metadata.merge("content" => retrieved.content))
|
50
38
|
end
|
51
39
|
end
|
52
40
|
|
53
|
-
def save_without_ngrams(book)
|
54
|
-
library.mkdir(book.id)
|
55
|
-
Preamble.new(book.metadata, book.content || "").save(library.path_to(book.id))
|
56
|
-
end
|
57
|
-
|
58
|
-
def save_ngrams(book)
|
59
|
-
book.all_ngrams.each_pair do |n, hash|
|
60
|
-
path = library.path_to(book.id, :ngrams, :n => n)
|
61
|
-
File.open(path, "w") do |file|
|
62
|
-
file.write hash.to_json
|
63
|
-
end
|
64
|
-
end
|
65
|
-
end
|
66
|
-
|
67
41
|
# Writes +book+ into the library as a Preamble document built from the
# book's metadata and content, at the path the library assigns to its id.
#
# @raise [MissingContent] if the book has no content; nothing is created
#   on disk in that case
def save(book)
  # Validate before touching the disk so that a rejected book does not
  # leave an empty directory behind in the library.
  raise MissingContent, "book #{book.id} is missing content" unless book.content

  library.mkdir(book.id)
  Preamble.new(book.metadata, book.content).save(library.path_to(book.id))
end
|
72
46
|
|
73
47
|
def archive_org_get(*book_ids, &block)
|
@@ -92,6 +66,7 @@ module WordTree
|
|
92
66
|
if failure
|
93
67
|
#TODO: logging
|
94
68
|
$stderr.puts "Unable to download from archive.org: #{failure}"
|
69
|
+
raise failure
|
95
70
|
else
|
96
71
|
book = Book.create(metadata["archive_org_id"], metadata, content)
|
97
72
|
save(book)
|
@@ -45,7 +45,7 @@ module WordTree
|
|
45
45
|
end
|
46
46
|
|
47
47
|
# Breadth-first search of the directory structure, operating on each book
|
48
|
-
def
|
48
|
+
def each_with_id(file_suffix_re=/\.(md|txt)$/, &block)
|
49
49
|
Find.find(@root) do |path|
|
50
50
|
if FileTest.directory?(path)
|
51
51
|
if File.basename(path)[0] == ?.
|
@@ -60,6 +60,10 @@ module WordTree
|
|
60
60
|
end
|
61
61
|
end
|
62
62
|
|
63
|
+
# Walks the library via each_with_id, yielding only the path of each
# entry; the id that each_with_id also supplies is discarded.
def each(&block)
  each_with_id do |book_path, _book_id|
    yield book_path
  end
end
|
66
|
+
|
63
67
|
end
|
64
68
|
end
|
65
69
|
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'strscan'
|
2
|
+
require_relative "../../ext/wordtree"
|
3
|
+
|
4
|
+
module WordTree
  module Text
    # Splits +text+ in two at the last space found at or before
    # +split_index+, dropping that space. If there is no such space the text
    # is cut exactly at +split_index+.
    #
    # @param text [String]
    # @param split_index [Integer] preferred position of the split
    # @return [Array(String, String)] the head and the remainder; the
    #   remainder is "" when +split_index+ is at or beyond the end of +text+
    def self.split_near(text, split_index)
      return [text, ""] if split_index >= text.size

      # rindex searches backwards starting at split_index. A negative index
      # would be interpreted relative to the end of the string, so treat it
      # as "no space found" and fall through to the hard cut.
      space_at = split_index >= 0 ? text.rindex(' ', split_index) : nil
      return [text[0...space_at], text[(space_at + 1)..-1]] if space_at

      [text[0...split_index], text[split_index..-1]]
    end

    # Wraps +input+ so that no line is longer than +wrap+ characters,
    # breaking at spaces where possible (see split_near) and cutting
    # mid-word only when a stretch of +wrap+ characters contains no space.
    # The text is not otherwise altered.
    #
    # @param input [String] the text to wrap
    # @param wrap [Integer] maximum line length; must be at least 1
    # @return [String] the wrapped text, every line terminated by "\n"
    # @raise [ArgumentError] if +wrap+ is less than 1
    def self.word_wrap(input, wrap=120)
      # A non-positive width can never make progress.
      raise ArgumentError, "wrap must be at least 1, got #{wrap.inspect}" if wrap < 1

      wrapped_output = String.new
      remainder = input
      loop do
        # Feed the remainder of the previous split back in so that each
        # pass consumes one more line of the text.
        output_line, remainder = split_near(remainder, wrap)
        wrapped_output << output_line + "\n"
        break unless remainder.size > wrap
      end
      wrapped_output << remainder + "\n" unless remainder.empty?

      wrapped_output
    end

  end
end
|
data/lib/wordtree/version.rb
CHANGED
@@ -1,3 +1,3 @@
|
|
1
|
-
module
|
2
|
-
VERSION = "0.
|
1
|
+
module WordTree
|
2
|
+
VERSION = "0.4.0"
|
3
3
|
end
|