weft-qda 0.9.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. data/lib/weft.rb +21 -0
  2. data/lib/weft/WEFT-VERSION-STRING.rb +1 -0
  3. data/lib/weft/application.rb +130 -0
  4. data/lib/weft/backend.rb +39 -0
  5. data/lib/weft/backend/marshal.rb +26 -0
  6. data/lib/weft/backend/mysql.rb +267 -0
  7. data/lib/weft/backend/n6.rb +366 -0
  8. data/lib/weft/backend/sqlite.rb +633 -0
  9. data/lib/weft/backend/sqlite/category_tree.rb +104 -0
  10. data/lib/weft/backend/sqlite/schema.rb +152 -0
  11. data/lib/weft/backend/sqlite/upgradeable.rb +55 -0
  12. data/lib/weft/category.rb +157 -0
  13. data/lib/weft/coding.rb +355 -0
  14. data/lib/weft/document.rb +118 -0
  15. data/lib/weft/filters.rb +243 -0
  16. data/lib/weft/wxgui.rb +687 -0
  17. data/lib/weft/wxgui/category.xpm +26 -0
  18. data/lib/weft/wxgui/dialogs.rb +128 -0
  19. data/lib/weft/wxgui/document.xpm +25 -0
  20. data/lib/weft/wxgui/error_handler.rb +52 -0
  21. data/lib/weft/wxgui/inspectors.rb +361 -0
  22. data/lib/weft/wxgui/inspectors/category.rb +165 -0
  23. data/lib/weft/wxgui/inspectors/codereview.rb +275 -0
  24. data/lib/weft/wxgui/inspectors/document.rb +139 -0
  25. data/lib/weft/wxgui/inspectors/imagedocument.rb +56 -0
  26. data/lib/weft/wxgui/inspectors/script.rb +35 -0
  27. data/lib/weft/wxgui/inspectors/search.rb +265 -0
  28. data/lib/weft/wxgui/inspectors/textcontrols.rb +304 -0
  29. data/lib/weft/wxgui/lang.rb +17 -0
  30. data/lib/weft/wxgui/lang/en.rb +45 -0
  31. data/lib/weft/wxgui/mondrian.xpm +44 -0
  32. data/lib/weft/wxgui/search.xpm +25 -0
  33. data/lib/weft/wxgui/sidebar.rb +498 -0
  34. data/lib/weft/wxgui/utilities.rb +148 -0
  35. data/lib/weft/wxgui/weft16.xpm +31 -0
  36. data/lib/weft/wxgui/workarea.rb +249 -0
  37. data/test/001-document.rb +196 -0
  38. data/test/002-category.rb +138 -0
  39. data/test/003-code.rb +370 -0
  40. data/test/004-application.rb +52 -0
  41. data/test/006-filters.rb +139 -0
  42. data/test/009a-backend_sqlite_basic.rb +280 -0
  43. data/test/009b-backend_sqlite_complex.rb +175 -0
  44. data/test/009c_backend_sqlite_bench.rb +81 -0
  45. data/test/010-backend_nudist.rb +5 -0
  46. data/test/all-tests.rb +1 -0
  47. data/test/manual-gui-script.txt +24 -0
  48. data/test/testdata/autocoding-test.txt +15 -0
  49. data/test/testdata/iso-8859-1.txt +5 -0
  50. data/test/testdata/sample_doc.txt +19 -0
  51. data/test/testdata/search_results.txt +1254 -0
  52. data/test/testdata/text1-dos-ascii.txt +2 -0
  53. data/test/testdata/text1-unix-utf8.txt +2 -0
  54. data/weft-qda.rb +28 -0
  55. metadata +96 -0
data/lib/weft.rb ADDED
@@ -0,0 +1,21 @@
1
+ require 'weft/filters'
2
+ require 'weft/document'
3
+ require 'weft/category'
4
+ require 'weft/backend'
5
+ require 'weft/application'
6
+
7
# Establish WEFT_VERSION, WEFT_VERSION_STRING and WEFT_TESTING. The
# version string file is only present in packaged builds; when it
# cannot be loaded the code is treated as an unreleased testing copy.
begin
  require 'weft/WEFT-VERSION-STRING'
  WEFT_VERSION = QDA::Version.new(WEFT_VERSION_STRING)
  # A build still carrying the default version was rs2exed from a local
  # CVS branch and counts as testing; any other value is a release or
  # testing version from a CVS checkout.
  WEFT_TESTING = (WEFT_VERSION == QDA::Version.default_version())
rescue LoadError
  WEFT_VERSION = QDA::Version.default_version()
  WEFT_VERSION_STRING = '[unreleased version]'
  WEFT_TESTING = true
end
@@ -0,0 +1 @@
1
# Release number of this distribution; lib/weft.rb parses it with
# QDA::Version.new to build WEFT_VERSION.
WEFT_VERSION_STRING = "0.9.6"
@@ -0,0 +1,130 @@
1
+ require 'observer'
2
+
3
+ module QDA
4
# A three-part +major.minor.release+ version number parsed from a string
# such as '0.9.6'. Each component is kept as the run of digits captured
# from the input, i.e. as a String.
class Version
  attr_reader :major, :minor, :release

  # The placeholder version, '0.0.0'.
  def Version.default_version()
    new('0.0.0')
  end

  # Parses +str+, which must contain three dot-separated runs of digits.
  # Raises ArgumentError when no such sequence can be found.
  def initialize(str)
    # to_s lets nil and other non-String input fail with the documented
    # ArgumentError instead of depending on Object#=~
    unless str.to_s =~ /(\d+)\.(\d+)\.(\d+)\s*/
      raise ArgumentError.new("Invalid version string #{str}")
    end
    @major, @minor, @release = $1, $2, $3
  end

  # The version in dotted form, e.g. '0.9.6'
  def to_s
    [@major, @minor, @release].join('.')
  end

  # True when +other+ has the same three components. +other+ may be a
  # Version or a String holding a version; a String that cannot be
  # parsed, or any object without version components, is simply unequal.
  def ==(other)
    if other.kind_of?(String)
      begin
        other = Version.new(other)
      rescue ArgumentError
        return false
      end
    end
    comparable = [:major, :minor, :release].all? do | part |
      other.respond_to?(part)
    end
    return false unless comparable

    major == other.major &&
      minor == other.minor &&
      release == other.release
  end
end
31
+
32
# The project-level application object. Storage behaviour is supplied by
# a backend module mixed into the instance (see new_virgin), and changes
# to the dirty flag are broadcast to registered observers.
class Application
  include Observable

  # +observer+, if given, is registered to receive dirty-flag updates.
  def initialize(observer = nil)
    add_observer(observer) if observer
    @dirty = false
  end

  # creates a completely empty new application / project, using the
  # backend +backend+, to be initialized with args +args+. When
  # +observer+ is supplied it is registered on the new application
  # before the backend is started.
  def Application.new_virgin(backend, args, observer = nil)
    app = new(observer)
    app.extend(backend)
    app.start(args)
    app.install_clean()
    app
  end

  # create some basic nodes, record the creating version in the
  # preferences, and mark the project as clean
  def set_up()
    save_category( Category.new('CATEGORIES', nil) )
    save_category( Category.new('SEARCHES', nil) )
    save_preference( 'CreateVersion', WEFT_VERSION )
    save_preference( 'CreateVersionString', WEFT_VERSION_STRING )
    undirty!
  end

  # yields each document returned by get_all_docs in turn
  def each_doc()
    get_all_docs().each do | doc |
      yield doc
    end
  end

  # are we in a state where we need saving - should be overridden in subclass
  def dirty?()
    @dirty
  end

  # flag unsaved changes; observers are only notified when the flag
  # actually flips from clean to dirty
  def dirty!()
    changed unless dirty?
    @dirty = true
    notify_observers(@dirty)
  end

  # flag that there are no unsaved changes; observers are only notified
  # when the flag actually flips from dirty to clean
  def undirty!()
    changed if dirty?
    @dirty = false
    notify_observers(@dirty)
  end

  # is it up and running
  def started?
    true
  end

  # signal to clear up - should this be the level at which an exception
  # is raised - no, should probably be at the gui level.
  # Raises RuntimeError unless consistent? holds or +force+ is true.
  def finish(force = false)
    raise "Not ready to be saved" unless consistent? || force
  end

  # fetch the text matched by a single query term: +function+ names the
  # kind of lookup and the first of +args+ is its argument
  def query_segment(function, *args)
    case function
    when "IS CODED BY"
      get_text_at_category( get_category(args[0]) )
    when "CONTAINS WORD"
      get_search_fragments( args[0], :wrap_both => 100 )
    else
      raise RuntimeError.new("Unknown function '#{function}' in query")
    end
  end
  private :query_segment

  # executes a query, which is a series of descriptions of text
  # functions (eg 'CODED BY "category x"') and operators describing
  # how to combine them ('AND', 'OR', 'NOT' / 'AND NOT'). Returns the
  # result set of the first term after the later terms have been
  # combined into it.
  def do_query(*query)
    text_1 = query_segment(query.shift, query.shift)

    # work rightwards through the query, doing various kinds of
    # combination of the result sets retrieved
    while (op = query.shift)
      return text_1 if op.empty?
      text_2 = query_segment(query.shift, query.shift)
      case op
      when 'AND'
        text_1.join(text_2)
      when 'OR'
        text_1.merge(text_2)
      when /(AND )?NOT/
        text_1.remove(text_2)
      else
        raise RuntimeError.new("Unknown operator '#{op}' in query")
      end
    end

    text_1
  end
end
130
+ end
@@ -0,0 +1,39 @@
1
+ module QDA
2
+ module Backend
3
+ autoload :MySQL, 'weft/backend/mysql'
4
+ # autoload :SQLite, 'backend/sqlite'
5
+ require 'weft/backend/sqlite'
6
+ # autoload :N6, 'backend/n6'
7
+ require 'weft/backend/n6'
8
+ autoload :RubyNative, 'weft/backend/marshal'
9
+
10
# Baseline versions of the storage operations. Some are harmless
# defaults; the others raise "virtual" until they are overridden.
module Abstract
  # receive arguments and make any connection required to the
  # storage source; this default does nothing
  def start(args = {})
    # raise "virtual"
    nil
  end

  # load a specific document - always raises here
  def get_doc(doctitle)
    raise RuntimeError, "virtual"
  end

  # an array of all the documents - should this include TEXT
  def get_all_docs
    Array.new
  end

  # all categories in a tree structure, the root nodes are returned;
  # this default offers a single category with an empty name
  def get_all_categories
    placeholder = Category.new('')
    [ placeholder ]
  end

  # save changes - always raises here
  def save
    raise RuntimeError, "virtual"
  end
end
38
+ end
39
+ end
@@ -0,0 +1,26 @@
1
+ # storage backend using Ruby's Marshal module
2
module QDA
  module Backend
    # Keeps documents and nodes as Marshal dumps in two files, 'docs'
    # and 'nodes', whose paths are formed by appending those names to
    # the configured base directory string.
    #
    # NOTE(review): this module is named MySQL, yet weft/backend.rb
    # autoloads this file for the constant RubyNative - confirm which
    # name is intended before relying on the autoload.
    module MySQL
      # Loads the stored documents and nodes. +args+ must supply
      # :basedir, which is used as a plain string prefix (so it needs
      # its own trailing separator). Every loaded object has its source
      # set to this backend, and the Nudist::RootNode among the nodes is
      # remembered as the root.
      #
      # NOTE(review): Marshal.load will instantiate whatever objects the
      # file describes - only open project directories that are trusted.
      def connect(args)
        @basedir = args[:basedir]
        # Marshal data is binary, so read it without newline translation
        File.open(@basedir + 'docs', 'rb') { | f | @docs = Marshal.load(f) }
        @docs.values.each { | doc | doc.source = self }

        File.open(@basedir + 'nodes', 'rb') { | f | @nodes = Marshal.load(f) }
        @nodes.each { | node | node.source = self }
        @root_node = @nodes.detect { | node | node.is_a?(Nudist::RootNode) }
      end

      # Writes the documents and nodes back to their files. The source
      # back-reference is cleared while dumping, so the backend itself
      # is not serialised, and is restored afterwards.
      def save()
        @docs.values.each { | doc | doc.source = nil }
        File.open(@basedir + 'docs', 'wb') { | f | Marshal.dump(@docs, f) }
        @docs.values.each { | doc | doc.source = self }

        @nodes.each { | node | node.source = nil }
        File.open(@basedir + 'nodes', 'wb') { | f | Marshal.dump(@nodes, f) }
        @nodes.each { | node | node.source = self }
      end
    end
  end
end
@@ -0,0 +1,267 @@
1
+ require 'dbi'
2
+
3
+ # storage backend using MYSQL-dbi module
4
+ module QDA
5
+ module Backend
6
+ module MySQL
7
+ # load up the database connection
8
# load up the database connection. +dbh+ may be the database handle
# itself or a Hash carrying the handle under :dbh, matching the calling
# convention of the connect(args) definition later in this module.
#
# NOTE(review): that later definition has the same name, so it replaces
# this one when the file is loaded.
def connect(dbh)
  @dbh = dbh.kind_of?(Hash) ? dbh[:dbh] : dbh
end
11
+
12
+ #
13
# Returns an array with one Document per row of the docs table. Each
# document carries the row's title and has its dbid set to the row's
# docid; no document text is loaded.
def get_all_docs()
  docs = []
  @dbh.select_all("SELECT * FROM docs") do | row |
    doc = Document.new(row['doctitle'])
    doc.dbid = row['docid']
    # collect explicitly - the block form of select_all does not hand
    # back what the block builds
    docs.push(doc)
  end
  docs
end
19
+ # fetch the document identified by the string ident
20
# fetch the document identified by +ident+. An +ident+ made up solely
# of digits is looked up as a docid; anything else is looked up as a
# document title (so a title may begin with a digit). The document's
# chunks are appended in docseq order.
#
# Raises RuntimeError when no matching row exists.
def get_doc(ident)
  if ident.to_s =~ /\A\d+\z/
    found = @dbh.select_one("SELECT doctitle FROM docs WHERE docid = ?",
                            ident)
    raise "No document found matching id '#{ident}'" unless found
    doc = Document.new(found[0])
    doc.dbid = ident
  else
    found = @dbh.select_one("SELECT docid FROM docs WHERE doctitle = ?",
                            ident)
    raise "No document found matching title '#{ident}'" unless found
    doc = Document.new(ident)
    doc.dbid = found[0]
  end

  @dbh.select_all("SELECT * FROM chunks WHERE docid = ? ORDER BY docseq",
                  doc.dbid) do | row |
    doc.append(row['chunk'], row['type'])
  end
  doc
end
48
+
49
+ # decide what level to code at?
50
+ # this returns a weighted search result set
51
# Meant to return a weighted search result set for +category+ (what
# level to code at is still undecided). Not implemented: no query is
# run and nil is returned.
def doc_search_by_category(category)
  # Sketch of the query this is expected to use:
  #   SELECT SUM(SQRT(idx_cat.score * idx_doc.score)), idx_doc.catid
  #   FROM reverse_index AS idx_doc, reverse_index AS idx_cat
  #   WHERE idx_cat.catid = 1
  #   AND idx_doc.catid != idx_cat.catid
  #   AND idx_doc.word = idx_cat.word
  #   GROUP BY idx_doc.catid;
  nil
end
59
+
60
# Stores +doc+ and returns the backend (self). A document that already
# has a dbid only has its title updated. A new document is inserted
# under its own title, is given the generated id as its dbid, and has
# one chunk row written per fragment, numbered in fragment order.
def save_document(doc)
  if doc.dbid
    @dbh.do("UPDATE docs SET doctitle = ? WHERE docid = ?",
            doc.title, doc.dbid)
  else
    # the title belongs to the document, not to this backend
    @dbh.do("INSERT INTO docs VALUES(NULL, ?)", doc.title)
    doc.dbid = @dbh.select_one("SELECT LAST_INSERT_ID()")[0]
    doc.fragments.each_with_index do | frag, i |
      @dbh.do("INSERT INTO chunks VALUES(?, 0, ?, ?)",
              doc.dbid, frag.text, i)
    end
  end
  self
end
76
+
77
# Remembers the database handle used by every other method here.
# +args+ is normally a Hash holding the handle under :dbh; a bare handle
# is accepted as well, matching the connect(dbh) signature that appears
# earlier in this module.
def connect(args)
  @dbh = args.kind_of?(Hash) ? args[:dbh] : args
end
80
+
81
# Loads the category whose id is +catid+ together with its codes.
# Returns nil (rather than raising) when no such category row exists.
def get_category(catid)
  found = @dbh.select_one("SELECT * FROM categories WHERE catid = ?",
                          catid)
  # not found
  return nil unless found

  category = Category.new(found['catname'])
  category.dbid = catid

  # replay each stored code onto the fresh category; start and offset
  # are converted to integers on the way
  codes_query = "SELECT codes.docid, codes.start, codes.offset
                 FROM codes
                 WHERE catid = ? "
  @dbh.select_all(codes_query, category.dbid) do | code_row |
    from = code_row['start'].to_i
    span = code_row['offset'].to_i
    category.code(code_row['docid'], from, span)
  end
  category
end
100
+
101
# Finds the categories indexed under any of +words+ and returns them as
# an array of Category objects with dbid set. The words may be passed
# individually or as one array; with no words at all the result is an
# empty array and no query is run.
#
# NOTE(review): this query reads tables named category and
# category_ridx, while the other methods here use categories - confirm
# the intended table names.
def category_search_by_words(*words)
  words = words.flatten
  return [] if words.empty?

  # one placeholder per word, so each word is bound (and quoted by the
  # driver) as its own member of the IN list
  placeholders = words.collect { '?' }.join(', ')
  cats = []
  @dbh.select_all("SELECT category.catid, category.catname, " +
                  "SUM(category_ridx.score) AS score " +
                  "FROM category_ridx, category " +
                  "WHERE category_ridx.catid = category.catid " +
                  "AND category_ridx.word IN ( #{placeholders} ) " +
                  "GROUP BY category_ridx.catid",
                  *words) do | r |
    cat = Category.new(r['catname'])
    cat.dbid = r['catid']
    cats.push(cat)
  end
  cats
end
118
+
119
+ # fetches all the categories in a tree structure, starting from the right
120
# fetches all the categories and assembles them into a tree, returning
# a Category named 'root' that holds the top-level categories.
#
# Rows are read in order of their l value, with l and r marking the
# span each category covers: a row whose r is l + 1 has nothing beneath
# it, any other row opens a branch that stays open until a row ending
# beyond it is met.
def get_all_categories()
  # stack of branches still open; the root entry is never closed, so
  # its 'r' is never consulted
  parents = [ { 'cat' => Category.new('root'), 'r' => nil } ]

  @dbh.select_all("SELECT * FROM categories ORDER BY l") do | row |
    l = row['l'].to_i
    r = row['r'].to_i
    cat = Category.new(row['catname'])
    cat.dbid = row['catid'].to_i

    # close every open branch that ends before this row does; this
    # must happen before the row is attached, or the row would land in
    # a branch it lies outside
    until parents.length == 1 || r <= parents[-1]['r']
      closed = parents.pop
      closed.fetch('cat').append_to(parents[-1]['cat'])
    end

    if l + 1 == r
      # a leaf goes straight into the innermost open branch
      parents[-1]['cat'].add_child(cat)
    else
      parents.push( { 'cat' => cat, 'r' => r } )
    end
  end

  # clean up outstanding items to be added
  while parents.length > 1
    closed = parents.pop
    closed.fetch('cat').append_to(parents[-1]['cat'])
  end

  parents[0]['cat']
end
151
+
152
# Stores +cat+ along with its codes and returns the backend (self).
#
# A category that already has a dbid has its codes deleted and its name
# updated. A new category is inserted with l/r values placing it at the
# parent's current r (or after the largest r when it has no parent),
# and the l/r values of the rows to its right are moved up by two to
# make room. In both cases one codes row is then written per vector.
#
# Raises RuntimeError when a new category's parent has no row.
def save_category(cat)
  if cat.dbid
    @dbh.do("DELETE FROM codes WHERE catid = ?", cat.dbid)
    @dbh.do("UPDATE categories SET catname = ? WHERE catid = ?",
            cat.name, cat.dbid)
  else
    if cat.parent
      row = @dbh.select_one("SELECT r FROM categories WHERE catid = ?",
                            cat.parent.dbid)
      unless row
        raise "No parent category found matching id '#{cat.parent.dbid}'"
      end
      # take over the parent's right edge
      l = row['r'].to_i
    else
      row = @dbh.select_one("SELECT MAX(r) r FROM categories")
      l = row['r'].to_i + 1
    end

    @dbh.do("INSERT INTO categories VALUES(NULL, ?, '', ?, ?)",
            cat.name, l, l + 1)
    cat.dbid = @dbh.select_one("SELECT LAST_INSERT_ID()")[0]
    @dbh.do("UPDATE categories SET l = l + 2 WHERE l > ?", l)
    # the new row already has its final r, so it is left out
    @dbh.do("UPDATE categories SET r = r + 2 WHERE r >= ? AND catid != ?",
            l, cat.dbid)
  end

  cat.vectors.each do | docid, vecs |
    vecs.each do | vec |
      @dbh.do("INSERT INTO codes VALUES(?, ?, ?, ?)",
              cat.dbid, docid, vec.start, vec.length )
    end
  end

  self
end
189
+
190
+ ## WARN - below here is older code imported from a different project
191
+ ## NOT YET TESTED FOR COMPATIBILITY
192
# Rebuilds the tables with sql_install, writes every document held in
# @docs, then writes the node tree starting from @root_node. The
# counter handed to sql_store_node yields 1, 2, 3... on successive calls.
def save()
  sql_install()
  @docs.values.each { | doc | sql_store_document(doc) }

  ticks = 0
  counter = lambda { ticks += 1 }
  sql_store_node(@root_node, counter)
end
202
+
203
# Drops and recreates the docs, fragments, nodes and vectors tables, in
# that order. Returns the result of the final CREATE statement.
#
# NOTE(review): the REFERENCES clauses name doc and DOCFRAGMENT, which
# are not among the tables created here - confirm the intended targets.
def sql_install()
  schema = [
    # docid is just nudist's internal document keying system
    [ 'docs', "CREATE TABLE docs (
                 docid varchar(8) primary key,
                 doctitle varchar(255),
                 docmemo text,
                 external CHAR(1) )" ],
    [ 'fragments', "CREATE TABLE fragments (
                 docid VARCHAR(8) REFERENCES doc(docid),
                 seq INT,
                 fragment text)" ],
    [ 'nodes', "CREATE TABLE nodes (
                 nodeid VARCHAR(32) PRIMARY KEY,
                 nodepath VARCHAR(255),
                 nodeuid VARCHAR(255),
                 nodename VARCHAR(32),
                 nodememo TEXT,
                 l INT,
                 r INT )" ],
    [ 'vectors', "CREATE TABLE vectors (
                 nodeid VARCHAR(32) REFERENCES nodes(nodeid),
                 docid VARCHAR(8) REFERENCES doc(docid),
                 start INT REFERENCES DOCFRAGMENT(seq),
                 length INT)" ]
  ]

  outcome = nil
  schema.each do | table, create_sql |
    @dbh.do("DROP TABLE IF EXISTS #{table}")
    outcome = @dbh.do(create_sql)
  end
  outcome
end
236
+
237
# Writes +node+ and, recursively, everything beneath it. +counter+ is
# called once before the children are stored and once after, and the
# two values become the node's left and right numbers; children are
# therefore inserted before their parent. One vectors row is written
# for every vector in the node's codes.
def sql_store_node(node, counter)
  left = counter.call()
  node.children.each { | child | sql_store_node(child, counter) }
  right = counter.call()

  node_values = [ node.uniq_id, node.path, node.child_id, node.title,
                  node.memo, left, right ]
  @dbh.do("INSERT INTO nodes VALUES(?, ?, ?, ?, ?, ?, ?)", *node_values)

  node.codes.each do | _doc, vectors |
    vectors.each do | vector |
      @dbh.do("INSERT INTO vectors VALUES (?, ?, ?, ?)",
              node.uniq_id, vector.doc.doc_id,
              vector.start, vector.length)
    end
  end
end
255
+
256
# Writes one docs row for +doc+ (id, title, memo and external flag),
# followed by one fragments row per fragment, numbered from zero in
# fragment order.
def sql_store_document(doc)
  doc_values = [ doc.doc_id, doc.title, doc.memo, doc.external? ]
  @dbh.do("INSERT INTO docs VALUES (?, ?, ?, ?)", *doc_values)

  doc.fragments.each_with_index do | fragment, seq |
    @dbh.do("INSERT INTO fragments VALUES(?, ?, ?)",
            doc.doc_id, seq, fragment)
  end
end
265
+ end
266
+ end
267
+ end