weft-qda 0.9.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. data/lib/weft.rb +21 -0
  2. data/lib/weft/WEFT-VERSION-STRING.rb +1 -0
  3. data/lib/weft/application.rb +130 -0
  4. data/lib/weft/backend.rb +39 -0
  5. data/lib/weft/backend/marshal.rb +26 -0
  6. data/lib/weft/backend/mysql.rb +267 -0
  7. data/lib/weft/backend/n6.rb +366 -0
  8. data/lib/weft/backend/sqlite.rb +633 -0
  9. data/lib/weft/backend/sqlite/category_tree.rb +104 -0
  10. data/lib/weft/backend/sqlite/schema.rb +152 -0
  11. data/lib/weft/backend/sqlite/upgradeable.rb +55 -0
  12. data/lib/weft/category.rb +157 -0
  13. data/lib/weft/coding.rb +355 -0
  14. data/lib/weft/document.rb +118 -0
  15. data/lib/weft/filters.rb +243 -0
  16. data/lib/weft/wxgui.rb +687 -0
  17. data/lib/weft/wxgui/category.xpm +26 -0
  18. data/lib/weft/wxgui/dialogs.rb +128 -0
  19. data/lib/weft/wxgui/document.xpm +25 -0
  20. data/lib/weft/wxgui/error_handler.rb +52 -0
  21. data/lib/weft/wxgui/inspectors.rb +361 -0
  22. data/lib/weft/wxgui/inspectors/category.rb +165 -0
  23. data/lib/weft/wxgui/inspectors/codereview.rb +275 -0
  24. data/lib/weft/wxgui/inspectors/document.rb +139 -0
  25. data/lib/weft/wxgui/inspectors/imagedocument.rb +56 -0
  26. data/lib/weft/wxgui/inspectors/script.rb +35 -0
  27. data/lib/weft/wxgui/inspectors/search.rb +265 -0
  28. data/lib/weft/wxgui/inspectors/textcontrols.rb +304 -0
  29. data/lib/weft/wxgui/lang.rb +17 -0
  30. data/lib/weft/wxgui/lang/en.rb +45 -0
  31. data/lib/weft/wxgui/mondrian.xpm +44 -0
  32. data/lib/weft/wxgui/search.xpm +25 -0
  33. data/lib/weft/wxgui/sidebar.rb +498 -0
  34. data/lib/weft/wxgui/utilities.rb +148 -0
  35. data/lib/weft/wxgui/weft16.xpm +31 -0
  36. data/lib/weft/wxgui/workarea.rb +249 -0
  37. data/test/001-document.rb +196 -0
  38. data/test/002-category.rb +138 -0
  39. data/test/003-code.rb +370 -0
  40. data/test/004-application.rb +52 -0
  41. data/test/006-filters.rb +139 -0
  42. data/test/009a-backend_sqlite_basic.rb +280 -0
  43. data/test/009b-backend_sqlite_complex.rb +175 -0
  44. data/test/009c_backend_sqlite_bench.rb +81 -0
  45. data/test/010-backend_nudist.rb +5 -0
  46. data/test/all-tests.rb +1 -0
  47. data/test/manual-gui-script.txt +24 -0
  48. data/test/testdata/autocoding-test.txt +15 -0
  49. data/test/testdata/iso-8859-1.txt +5 -0
  50. data/test/testdata/sample_doc.txt +19 -0
  51. data/test/testdata/search_results.txt +1254 -0
  52. data/test/testdata/text1-dos-ascii.txt +2 -0
  53. data/test/testdata/text1-unix-utf8.txt +2 -0
  54. data/weft-qda.rb +28 -0
  55. metadata +96 -0
@@ -0,0 +1,366 @@
1
+ # Read-only storage backend using N6's native file format
2
+ module QDA
3
+ module Backend
4
+ module N6
5
+ # number of seconds between 1/1/1900 and 1/1/1970
6
+ SEVENTY_YEARS = 2208985187
7
+
8
# Prepares the backend for the N6 project directory passed as
# +args[:basedir]+: resets the document id counter, creates the
# parenthesis reader and then loads documents followed by nodes.
def start(args)
  @basedir = args[:basedir]
  @npr = NPReader.new
  @doc_dbid_counter = 0
  load_docs
  load_nodes
end
15
+
16
# Returns the array of documents collected in @docs by load_docs.
def get_all_docs
  @docs
end
19
# Returns the first loaded document whose title equals +title+, or nil
# when no document has that title.
def get_doc(title)
  @docs.detect { | candidate | candidate.title == title }
end
22
+
23
# Returns the root node assigned to @root_node by load_nodes.
def get_all_categories
  @root_node
end
26
+
27
+ private
28
+ PARSE_DOCS = /^\("((?:[^"]|\\")*)" (\d+) (\d+) (?#
29
+ )(NIL|(?:\([0-9 ]+\))) (?#
30
+ )\((\d{10}) \. (\d{10})\) (?#
31
+ )"((?:[^"]|\\")*)"/
32
+
33
# Reads the document catalogue 'DATABASE/docsys' below @basedir and
# builds one Document for every line matching PARSE_DOCS, collecting
# them in @docs.
#
# For each entry the chunk table 'DATABASE/DOCFACTS/<id>' is parsed with
# @npr and used to slice 'DATABASE/DOCFILES/<id>' into fragments; a memo
# is read from 'DATABASE/DOCMEMS/<id>' when that file exists. Every
# document receives singleton +append+ and +chunks_to_vector+ methods
# which record the length returned by each append so that chunk-based
# codings can later be converted into an offset and a length.
def load_docs()
  @docs = []
  File.foreach(File.join(@basedir, 'DATABASE/docsys')) do | docline |
    matches = PARSE_DOCS.match(docline)
    next unless matches

    doc_id = matches[7]
    title = matches[1]
    external = (matches[2] == "0")

    # read the breaking of the documents into chunks
    docfacts = File.open(File.join(@basedir, 'DATABASE/DOCFACTS', doc_id)) do | dff |
      @npr.parse(dff.read)
    end
    # the reader returns nil when the file holds no parenthesised group
    chunks = docfacts ? docfacts.values : []

    # read the source document in chunks defined in docfacts
    # not sure why each fragment seems to be made up of two chunks
    fragments = []
    File.open(File.join(@basedir, 'DATABASE/DOCFILES', doc_id)) do | df |
      chunks.each do | chunk |
        # IO#read returns nil at end-of-file for a positive length, so
        # fall back to an empty string instead of failing on nil
        # bit 1 - seems longer
        fragment = df.read(chunk.values[0].to_i) || ''
        # bit 2 - generally shorter
        fragment << (df.read(chunk.values[2].to_i) || '')
        # NOTE: the previous code called String#sub (non-destructive) to
        # trim trailing newlines and discarded the result, so fragments
        # have always been stored untrimmed; that behaviour is kept.
        fragments.push(fragment)
      end
    end

    memo = ''
    memo_path = File.join(@basedir, 'DATABASE/DOCMEMS', doc_id)
    if FileTest.exist?(memo_path)
      memo = File.open(memo_path) { | dm | dm.read() }
    end

    # catalogue timestamps are shifted by SEVENTY_YEARS to get Time values
    crt_date = Time.at(matches[5].to_i - SEVENTY_YEARS)
    mod_date = Time.at(matches[6].to_i - SEVENTY_YEARS)

    doc = Document.new(title, '', memo, crt_date, mod_date)
    doc.instance_eval { @external = true } if external

    class << doc
      # Converts a run of +length+ chunks beginning at chunk index
      # +start+ into the summed length of the preceding chunks and the
      # summed length of the run itself. External documents always
      # yield 0, 0.
      def chunks_to_vector(start, length)
        return 0, 0 if @external

        # @chunkmap is only created by the first append, and Array#[]
        # returns nil for out-of-range slices
        map = @chunkmap || []
        this_start = (map[0, start] || []).inject(0) { | tot, ck | tot + ck }
        this_offset = (map[start, length] || []).inject(0) { | tot, ck | tot + ck }

        return this_start, this_offset
      end

      # Appends +text+ through the original append and remembers the
      # length it reports.
      def append(text)
        @chunkmap ||= []
        len = super(text)
        @chunkmap.push(len)
      end
    end
    doc.dbid = next_doc_dbid

    fragments.each { | frag | doc.append(frag) }
    @docs.push(doc)
  end
end
122
+
123
# Increments the in-memory document id counter and returns the new
# value (an Integer).
def next_doc_dbid
  @doc_dbid_counter += 1
end
127
+
128
+ PARSE_INDEX = /^\((\d+|NIL) "([^"]*)" (?#
129
+ )(?:(NIL)|(?:"((?:[^"]|\\")*)")) (?# comment
130
+ )(?:(NIL)|(?:"((?:[^"]|\\")*)")) (?# "memoreference
131
+ )\((\d{10}) \. (\d{10})\) (.*?) (\d+)\) $/
132
# Reads the node index 'DATABASE/indexsys' below @basedir and rebuilds
# the category tree: the first entry becomes @root_node, and every node
# (root included) is appended to @nodes in file order.
#
# An entry may span several physical lines, so lines are accumulated
# until the buffered text matches PARSE_INDEX. The last captured number
# of an entry is used as the count of children that follow it, which is
# how parents are assigned.
#
# The node memo, read from 'DATABASE/NODEMEMS/<ref>' when that file
# exists, is handed to Category.new as its third argument. Previously
# it was read and then dropped.
# NOTE(review): the SQLite backend in this package builds categories as
# Category.new(name, parent, catdesc) and stores cat.memo in catdesc;
# confirm the third constructor argument is the memo.
def load_nodes()
  @nodes = []
  @root_node = nil
  curr_node = nil
  pending = ''

  # hash of nodes -> number of pending children
  tree_kids = {}
  File.foreach(File.join(@basedir, 'DATABASE/indexsys')) do | line |
    next if line =~ /^\s*$/

    matches = PARSE_INDEX.match(pending + line)
    if matches.nil?
      pending += line
      next
    end
    pending = ''

    title = matches[2]
    codes = matches[9]
    child_count = matches[10].to_i

    # load the node memo if there is one
    memo = ''
    if matches[6]
      memofile = File.join(@basedir, 'DATABASE/NODEMEMS', matches[6])
      # they don't necessarily have a memo file, in which case memo = ''
      if FileTest.exist?(memofile)
        memo = File.open(memofile) { | nm | nm.read() }
      end
    end

    # only pass a memo when there is one, so memo-less nodes are
    # constructed exactly as before
    build = Proc.new do | parent |
      memo.empty? ? Category.new(title, parent) : Category.new(title, parent, memo)
    end

    if curr_node.nil?
      # the first entry is the root node (no parent)
      @root_node = build.call(nil)
      load_node_codes(@root_node, codes)
      @nodes.push(@root_node)
      # root node is always looking for additional children - the
      # default parent; -1 only ever decreases so it never reaches 0
      tree_kids[@root_node] = -1
      curr_node = @root_node
    else
      # go back up the tree until we find a node which needs more
      # children
      curr_node = curr_node.parent until tree_kids[curr_node] != 0
      tree_kids[curr_node] -= 1

      new_node = build.call(curr_node)
      load_node_codes(new_node, codes)

      # get ready for the next round
      tree_kids[new_node] = child_count
      curr_node = new_node
      @nodes.push(new_node)
    end
  end
end
206
+
207
+
208
+ # parse which parts of which documents are coded by category +category+
209
# Applies to +category+ the codings described by +codesrc+, the raw
# parenthesised coding list taken from the node index.
#
# Returns nil immediately when +codesrc+ is the string 'NIL'. Each
# parsed group starts with a document title followed by coded segments;
# a segment's first and third values are used as first and last chunk
# index and converted with the document's +chunks_to_vector+. Segments
# whose converted length is 0 are skipped.
#
# Raises RuntimeError when +codesrc+ cannot be parsed or names a
# document that has not been loaded.
def load_node_codes(category, codesrc)
  return if codesrc == 'NIL'

  parsed = @npr.parse(codesrc)
  raise "cannot parse #{codesrc} for node codes" unless parsed

  parsed.values.each do | val |
    # val.values[0] is the title of the coded document
    doc_title = val.values[0]
    doc = get_doc(doc_title)
    unless doc
      raise "node codes refer to unknown document #{doc_title.inspect}"
    end

    # the segments of the document which are coded
    val.values[1..-1].each do | code |
      start = code.values[0].to_i
      length = code.values[2].to_i - start + 1
      # we have to convert these chunks into character-indexes
      conv = doc.chunks_to_vector(start, length)
      # can be 0-length coded when applied to external doc
      next if conv[1] == 0
      category.code(doc.dbid, *conv)
    end
  end
end
233
+ end
234
+
235
+ # NPReader.rb - Nested parenthesis (LISP syntax) parser.
236
+ # Copyright (C) 2001 Gordon James Miller
237
+
238
+ # This library is free software; you can redistribute it and/or
239
+ # modify it under the terms of the GNU Lesser General Public License
240
+ # as published by the Free Software Foundation; either version 2.1
241
+ # of the License, or (at your option) any later version.
242
+
243
+ # A container for strings that reside between matching parenthesis. Each
244
+ # instance of this class contains a list of values that are either
245
+ # strings or other NPNode objects representing nesting.
246
+
247
# A container for the strings found between one pair of matching
# parentheses. +values+ holds, in order, completed strings and nested
# NPNode instances.
class NPNode
  @@NORMAL_MODE = 0
  @@QUOTE_MODE = 1

  # The members of this group.
  attr_reader :values

  # The parent of this group.
  attr_reader :parent

  # The level of this group. This is set by the constructor and is
  # based upon the level of the parent.
  attr_reader :level

  # The state of this group. This will either be NORMAL or QUOTE.
  # In QUOTE mode whitespace is kept as part of the current value.
  attr_reader :state

  # Finishes the value being accumulated: if any characters are
  # pending they are concatenated, appended to +values+ and the buffer
  # is emptied. Does nothing when no characters are pending.
  def close
    unless @curval.empty?
      # Array#join concatenates the characters on every Ruby version;
      # Array#to_s only did so on Ruby 1.8 and returns the inspect form
      # (e.g. '["a", "b"]') on later versions.
      @values.push(@curval.join)
      @curval.clear
    end
  end

  # Initialize a new instance that has the specified parent. The
  # parent can be nil, in which case it is assumed that this is
  # the top level node. If the parent is not nil, then this
  # object is added as a child to the parent.
  def initialize(parent)
    @parent = parent
    @parent.push(self) unless @parent.nil?

    @state = @@NORMAL_MODE
    @level = @parent.nil? ? 0 : @parent.level + 1

    @values = Array.new
    @curval = Array.new
  end

  # Feeds one item into the group. An NPNode is appended to +values+ as
  # a nested group. A single or double quote character toggles quote
  # mode and finishes the current value. Whitespace outside quote mode
  # finishes the current value. Any other character is added to the
  # current value.
  def push(ch)
    if ch.kind_of?(NPNode)
      @values.push(ch)
    elsif ch == "'" || ch == "\""
      @state = (@state + 1) % 2
      close
    elsif (@state != @@QUOTE_MODE) && ch =~ /[ \t\n\r]/
      close
    else
      @curval.push(ch)
    end
  end

  # Renders the group as its values joined by single spaces and wrapped
  # in parentheses.
  def to_s
    return "(#{@values.join(' ')})"
  end
end
323
+
324
+ # An implementation of a nested parenthesis reader. This implementation
325
+ # constructs an AST from a data source.
326
# An implementation of a nested parenthesis reader. It builds a tree of
# NPNode objects from a string.
class NPReader

  # Initialize a new instance of the reader. This does not start the
  # parsing, that is done with the parse method.
  def initialize ()
  end

  # Parse the data contained in +string+ and return the first top level
  # group, or nil when the string contains no opening parenthesis.
  #
  # The string is walked character by character (not byte by byte) so
  # that collected values keep the encoding of +string+; strings rebuilt
  # from single bytes are binary and do not compare equal to the same
  # non-ASCII text read elsewhere. Characters outside any group, and
  # closing parentheses without a matching opener, are ignored.
  def parse (string)
    @curgroup = nil
    @top = nil

    string.each_char do | ch |
      case ch
      when '('
        @curgroup = NPNode.new(@curgroup)
        @top = @curgroup if @top.nil?
      when ')'
        # a stray closer used to fail with NoMethodError on nil
        next if @curgroup.nil?
        @curgroup.close
        @curgroup = @curgroup.parent
      else
        @curgroup.push(ch) unless @curgroup.nil?
      end
    end

    return @top
  end
end
365
+ end
366
+ end
@@ -0,0 +1,633 @@
1
+ require 'sqlite'
2
+ # require 'sqlite3'
3
+ require 'strscan'
4
+ require 'tempfile'
5
+ require 'fileutils'
6
+ require 'base64'
7
+ require 'rexml/document'
8
+ # require 'iconv'
9
+
10
+ module QDA
11
+ # Storage backend using SQLite module - can use SQlite 3 or SQLite, but
12
+ # currently problems with SQLite 3 and non-ASCII characters. Will pick
13
+ # up whether sqlite or sqlite3 is available.
14
+ module Backend::SQLite
15
+
16
+ require 'weft/backend/sqlite/schema.rb'
17
+ require 'weft/backend/sqlite/upgradeable.rb'
18
+ require 'weft/backend/sqlite/category_tree.rb'
19
+ include Upgradeable
20
+
21
+ # if working with sqlite v2 with the sqlite-ruby v2, we need a
22
+ # couple of compatibility tweaks.
23
+ if defined?(::SQLite)
24
+ SQLITE_DB_CLASS = ::SQLite::Database
25
+ # Ruby-SQLite3 statements have a close() method, but Ruby-SQLite
26
+ # v 2 don't - so we supply a dummy method for when using v2
27
+ class ::SQLite::Statement
28
+ def close(); end
29
+ end
30
+ # SQLite3 introduced this more ruby-ish notation
31
+ class ::SQLite::Database::FunctionProxy
32
+ alias :result= :set_result
33
+ end
34
+ elsif defined?(::SQLite3)
35
+ SQLITE_DB_CLASS = ::SQLite3::Database
36
+ else
37
+ raise LoadError, "No SQlite database class loaded"
38
+ end
39
+
40
# Database handle used by this backend: a subclass of whichever SQLite
# database class was selected as SQLITE_DB_CLASS, configured to return
# rows as hashes and extended with undo/redo and date helpers.
class Database < SQLITE_DB_CLASS
  # Opens the database stored in +file+ and switches result rows to
  # hashes.
  def initialize(file)
    # super(file, :driver => "Native")
    super(file)
    self.results_as_hash = true
    # self.type_translation = true
  end

  # Executes, inside one transaction, the first column of every row of
  # +undoable+ with step = 1, then decrements all steps and deletes the
  # rows that reached step 0.
  #
  # This used to go through @dbh, which is never set on a Database
  # instance (the handle is the object itself), so every call failed on
  # nil; it now calls its own methods like redo_action does.
  def undo_action()
    transaction do
      execute("SELECT * FROM undoable WHERE step = 1
               ORDER BY step, actionid DESC") do | task |
        execute(task[0])
      end
      execute("UPDATE undoable SET step = step -1")
      execute("DELETE FROM undoable WHERE step = 0")
    end
  end

  # Executes, inside one transaction, the first column of every row of
  # +undoable+ with step = -1, deletes those rows and then increments
  # all remaining steps.
  def redo_action()
    transaction do
      execute("SELECT * FROM undoable WHERE step = -1
               ORDER BY step, actionid DESC") do | task |
        execute(task[0])
      end
      execute("DELETE FROM undoable WHERE step = -1")
      execute("UPDATE undoable SET step = step + 1")
    end
  end

  # Formats +date+ as 'YYYY-MM-DD HH:MM:SS'; returns '' for nil/false.
  def date_freeze(date)
    date ? date.strftime('%Y-%m-%d %H:%M:%S') : ''
  end

  # Inverse of date_freeze: returns a local Time built from the fields
  # of +str+, or nil when +str+ is nil or empty.
  def date_thaw(str)
    return nil if str.nil? || str.empty?
    return Time.local( *str.split(/[- :]/) )
  end
end
79
+
80
+ attr_reader :dbh, :dbfile
81
+
82
+ # load up the database connection. A hash argument containing the
83
+ # key :dbfile should be supplied. If this is +nil+, then a
84
+ # temporary storage will be used
85
# Opens the backend. +args+ must contain the key :dbfile; its value is
# the path of an existing database, or nil to begin with empty
# temporary storage.
#
# All work happens on a temporary copy (@tmpfile) of the database file.
# When an existing file was given, format upgrades are applied to the
# copy. The backend is marked clean afterwards.
#
# Raises ArgumentError when :dbfile is missing and RuntimeError when the
# named file does not exist.
def start(args)
  unless args.key?(:dbfile)
    raise ArgumentError, "Must specify SQLite dbfile to load from"
  end

  @dbfile = args[:dbfile]
  # File.exist? replaces File.exists?, which is deprecated and no
  # longer available in Ruby 3.2
  if @dbfile and ! File.exist?(@dbfile)
    raise RuntimeError, "Tried to open an non-existent database"
  end

  tmp_fname = @dbfile ? File::basename(@dbfile) : 'Weft'
  tmpfile = Tempfile.new(tmp_fname)
  tmpfile.close(false) # don't delete
  # Keep a reference to the Tempfile: once it is garbage collected its
  # finalizer unlinks the file, which save and revert still need.
  @tmpfile_handle = tmpfile

  @tmpfile = tmpfile.path
  FileUtils.copy(@dbfile, @tmpfile) if @dbfile
  @dbh = Database.new(@tmpfile)
  # if opening from an existing file, check and do any upgrading
  # required from older versions
  do_version_format_upgrading() if @dbfile
  undirty!
end
109
+
110
# Uses the already-open database handle passed as +args[:dbh]+ as this
# backend's @dbh.
def connect(args)
  @dbh = args[:dbh]
end
113
+
114
# Shuts the backend down: forgets the cached category tree and closes
# the database handle. +force+ is accepted but not used here.
def end(force = false)
  @cat_tree = nil
  @dbh.close
end
118
+
119
# Writes the working copy to +target+ (by default the file the backend
# was opened from or last saved to), remembers +target+ as @dbfile,
# reopens the working copy and marks the backend clean.
#
# Raises RuntimeError when no target is known.
def save(target = @dbfile)
  unless target
    raise RuntimeError,
          "No previously saved file, and no named supplied for save"
  end
  # the handle is closed while the file is copied
  @dbh.close
  @dbfile = target
  FileUtils.copy(@tmpfile, @dbfile)
  @dbh = Database.new(@tmpfile)
  undirty!
end
130
+
131
+ # roll the current state back to the last-saved state.
132
# Rolls the current state back to the last-saved state by copying the
# saved file (@dbfile) over the working copy and reopening it.
#
# The cached category tree is dropped as well, because it may still
# reflect changes that have just been discarded; it is reloaded from
# the database on the next use.
#
# Raises RuntimeError when there is no saved file to go back to.
def revert()
  if @dbfile.nil?
    raise RuntimeError, "No previously saved file to revert to"
  end
  @dbh.close()
  FileUtils.copy(@dbfile, @tmpfile)
  @cat_tree = nil
  @dbh = Database.new(@tmpfile)
end
137
+
138
+
139
+ # hint to do the next series of actions as a batch
140
# Hint to perform the next series of actions as one batch: runs the
# given block inside a single database transaction.
def batch
  @dbh.transaction do
    yield
  end
end
143
+
144
# Returns the category tree, loading it on first use: from the xml
# stored in category_structure when that is non-empty, otherwise as a
# new empty tree. The result is cached in @cat_tree.
def cat_tree
  return @cat_tree if @cat_tree

  xml = @dbh.get_first_value("SELECT xml FROM category_structure")
  has_xml = (xml and xml.length > 0)
  @cat_tree = has_xml ? CategoryTree.load(xml) : CategoryTree.new()
end
153
+
154
+ # private :cat_tree
155
+ # get every doc
156
# Returns one Document per row of the document table. Only the title
# and the database id are filled in.
def get_all_docs()
  found = []
  @dbh.execute("SELECT doctitle, docid FROM document") do | row |
    document = Document.new(row['doctitle'])
    document.dbid = row['docid'].to_i
    found << document
  end
  found
end
165
+
166
+ # fetch the document identified by the string ident
167
# Fetches the document identified by +ident+. An Integer, or a string
# consisting only of digits, is looked up as a document id; anything
# else is looked up as a title.
#
# Returns the Document with title, text, memo, dates and dbid set.
# Raises RuntimeError ("Not found: ...") when no row matches.
def get_doc(ident)
  doc = nil
  @dbh.transaction do
    # Integer rather than Fixnum (gone in Ruby 3.2); the id test is
    # anchored to the whole string instead of to a single line
    by_id = ident.kind_of?(Integer) || ident =~ /\A\d+\Z/
    stmt = if by_id
             @dbh.prepare("SELECT * FROM document WHERE docid = ?")
           else
             @dbh.prepare("SELECT * FROM document WHERE doctitle = ?")
           end
    begin
      stmt.execute!(ident) do | r |
        doc = Document.new(r['doctitle'].dup,
                           r['doctext'].dup,
                           r['docmemo'].dup,
                           @dbh.date_thaw(r['created_date']),
                           @dbh.date_thaw(r['modified_date']) )
        doc.dbid = r['docid'].to_i
      end
    ensure
      # close the statement on the "Not found" path and on errors too
      stmt.close()
    end
    raise "Not found: #{ident}" if doc.nil?
  end
  return doc
end
190
+ alias :get_document :get_doc
191
+
192
# Stores +pref_value+ under +pref_name+ in app_preference, replacing
# any existing entry, and marks the backend dirty. The value is
# marshalled and Base64-encoded before it is stored.
def save_preference(pref_name, pref_value)
  marshalled = Marshal.dump(pref_value)
  frozen_value = Base64.encode64(marshalled)
  @dbh.transaction do
    @dbh.execute("INSERT OR REPLACE INTO app_preference
                  VALUES (?, ?)",
                 pref_name, frozen_value)
  end
  dirty!
end
201
+
202
# Returns the value stored under +pref_name+ by save_preference, or nil
# when there is no such preference.
#
# NOTE(review): the stored value is passed to Marshal.load, so a
# database file from an untrusted source can make this instantiate
# arbitrary objects - confirm project files are always trusted.
def get_preference(pref_name)
  frozen_pref = nil
  @dbh.transaction do
    @dbh.execute("SELECT value FROM app_preference
                  WHERE name = ? ", pref_name) { | r | frozen_pref = r['value'] }
  end
  return nil if frozen_pref.nil?

  decoded = Base64.decode64(frozen_pref)
  Marshal.load(decoded)
end
213
+
214
# Saves +doc+ inside a transaction, marks the backend dirty and returns
# +doc+. Raises TypeError unless +doc+ is a QDA::Document.
def save_document(doc)
  raise TypeError unless doc.kind_of?(QDA::Document)

  @dbh.transaction do
    _save_document(doc)
  end
  dirty!
  doc
end
220
+
221
# Writes +doc+ without opening a transaction. A document without a
# dbid is inserted and receives the new row id as its dbid; a document
# with a dbid has its title, text and memo updated. The modified date
# is set to the current time in both cases.
def _save_document(doc)
  if doc.dbid.nil?
    @dbh.execute("INSERT INTO document
                  VALUES(NULL, ?, ?, ?, ?, ?)",
                 doc.title, doc.text, doc.memo,
                 @dbh.date_freeze(doc.create_date),
                 @dbh.date_freeze(Time.now()))
    doc.dbid = @dbh.last_insert_row_id().to_i
  else
    @dbh.execute("UPDATE document
                  SET doctitle = ?, doctext = ?,
                  docmemo = ?, modified_date = ?
                  WHERE docid = ? ",
                 doc.title, doc.text, doc.memo,
                 @dbh.date_freeze(Time.now()),
                 doc.dbid)
  end
end
239
+
240
+ # delete the document identified by +dbid+ from the database
241
# Deletes the document row whose id is +dbid+ inside a transaction and
# marks the backend dirty.
def delete_document(dbid)
  @dbh.transaction { @dbh.execute("DELETE FROM document WHERE docid = ?", dbid) }
  dirty!
end
247
+
248
+ # retrieve the category with the internal id +catid+, along with
249
+ # its codes. If +get_structure+ is set to a true value then the
250
+ # category's children will also be retrieved from the database
251
# Retrieves the category with the internal id +catid+ (an Integer or a
# string of digits), along with its parent and its codes. If
# +get_structure+ is true the category's children are built as well.
#
# Raises RuntimeError for an id that is not numeric and when no
# category has that id.
def get_category(catid, get_structure = false)
  # Only strings are matched against the pattern: Integer#=~ is no
  # longer defined on Ruby 3.2. Integer replaces the removed Fixnum.
  catid = catid.to_i if catid.kind_of?(String) && catid =~ /\A\d+\Z/
  raise "Invalid id #{catid.inspect}" unless catid.kind_of?(Integer)

  category = nil
  stmt = @dbh.prepare("SELECT * FROM category WHERE catid = ?")
  begin
    stmt.execute!(catid) do | r |
      parent = get_category_parent(catid)
      category = Category.new(r['catname'], parent, r['catdesc'])
      category.dbid = catid
    end
  ensure
    # close the statement when nothing is found or an error occurs too
    stmt.close()
  end
  raise "No category found matching id '#{catid}'" unless category

  get_codes_for_category(category)
  get_and_build_children(category) if get_structure
  category
end
269
+
270
+ # gets the root category named +name+
271
# Returns the category for the root of the category tree whose name is
# +name+. Raises RuntimeError when no root has that name.
def get_root_category(name)
  match = cat_tree.roots.detect { | candidate | candidate.name == name }
  if match.nil?
    raise "Not found, root category #{name.inspect}"
  end
  get_category(match.dbid)
end
276
+
277
+ # fetch categories by relative or absolute paths. Returns an
278
+ # array of categories
279
# Fetches categories by relative or absolute path and returns them as
# an array. A +path+ without any '/' is handed to
# get_categories_by_name instead of the category tree.
def get_categories_by_path(path)
  # cos it should be quicker ...
  return get_categories_by_name(path) unless path =~ /\//

  cat_tree.find(path).map { | found | get_category(found.dbid) }
end
289
+
290
+ # fetch categories by partial names. This is currently
291
+ # case-insensitive by default. Returns an array of categories
292
+ # whose names match.
293
# Fetches the categories whose names start with +namebit+ and returns
# them as an array. Matching ignores case unless +insensitive+ is
# false. Only categories with parent >= 0 are considered.
def get_categories_by_name(namebit, insensitive = true)
  if insensitive
    sql = "SELECT catid FROM category
           WHERE UPPER(catname) LIKE ?
           AND parent >= 0"
    pattern = namebit.upcase + "%"
  else
    sql = "SELECT catid FROM category
           WHERE catname GLOB ?
           AND parent >= 0"
    # GLOB uses '*' as its wildcard; the '%' appended before was
    # matched literally, so case-sensitive lookups found nothing
    pattern = namebit + "*"
  end

  categories = []
  stmt = @dbh.prepare(sql)
  begin
    @dbh.transaction do
      stmt.execute!(pattern) do | r |
        categories.push( get_category( r['catid'] ) )
      end
    end
  ensure
    # release the statement even when a lookup raises
    stmt.close()
  end
  return categories
end
315
+
316
# Asks the category tree whether +descendant+ lies below +ancestor+,
# comparing the two categories by their database ids.
def is_descendant?(ancestor, descendant)
  ancestor_id = ancestor.dbid
  descendant_id = descendant.dbid
  cat_tree.is_descendant?(ancestor_id, descendant_id)
end
319
+
320
+ # builds the tree structure below +category+, modifying
321
+ # +category+ in place. After this call, the retrieved structure
322
+ # is available as the +children+ property of the category.
323
# Builds the tree structure below +category+ from the category tree,
# creating a Category (name and dbid only) for every descendant with
# its parent set accordingly.
#
# The children of a tree node are iterated with +children.each+, the
# form used everywhere else in this backend; a block passed straight
# to +children+ is only run if that method happens to yield.
def get_and_build_children(category)
  # this duplicates stuff below
  append_f = Proc.new do | parent, elem |
    cat = Category.new(elem.name, parent)
    cat.dbid = elem.dbid
    elem.children.each { | c | append_f.call(cat, c) }
  end

  cat_tree[category.dbid].children.each do | first_child |
    append_f.call(category, first_child)
  end
end
private :get_and_build_children
336
+
337
+ # applies the codes to category +cat+
338
# Reads every code row stored for +cat+ and applies it to +cat+ as
# (docid, offset, length), all converted to integers. Returns +cat+.
def get_codes_for_category(cat)
  sql = "SELECT docid, offset, length
         FROM code
         WHERE catid = ? "
  @dbh.execute(sql, cat.dbid) do | row |
    vector = ['docid', 'offset', 'length'].map { | col | row[col].to_i }
    cat.code(*vector)
  end
  cat
end
348
+
349
+ # looks up the string indices of the document and returns the
350
+ # appropriate text fragments as an array
351
+ # returns a hash keyed on document title where the values are an
352
+ # array of fragments in order of offset from the start of the document
353
# Collects the text coded by +cat+. For every code of the category the
# coded substring is cut out of the document text in SQL, and a
# Fragment (text, document title, offset, document id) is added to a
# FragmentTable. Rows are read ordered by document id and offset.
# Returns the FragmentTable.
def get_text_at_category(cat)
  table = FragmentTable.new()
  sql = "SELECT document.doctitle AS doctitle,
                code.docid AS docid,
                code.offset AS offset, code.length,
                SUBSTR(document.doctext,
                       code.offset + 1, code.length) AS fragment
         FROM document, code
         WHERE code.catid = ?
         AND code.docid = document.docid
         ORDER BY code.docid, code.offset"
  @dbh.execute(sql, cat.dbid) do | r |
    fragment = Fragment.new(r['fragment'], r['doctitle'],
                            r['offset'].to_i, r['docid'].to_i)
    table.add(fragment)
  end
  table
end
371
+
372
# Returns the parent category of the category with id +catid+, or nil
# when the category tree records no parent for it.
def get_category_parent(catid)
  return nil unless cat_tree[catid].parent

  get_category(cat_tree[catid].parent)
end
379
+
380
+ # fetches all the categories in a tree structure, starting from the root categories
381
# Converts the category tree (internal storage) into a tree of actual
# Categories and returns the array of root Categories. Each Category
# carries only its name, parent and dbid.
def get_all_categories()
  convert = Proc.new do | node, parent_cat |
    built = Category.new(node.name, parent_cat)
    built.dbid = node.dbid
    node.children.each { | kid | convert.call(kid, built) }
    built
  end
  cat_tree.roots.map { | root_node | convert.call(root_node, nil) }
end
392
+
393
+ # saves the category
394
# Saves +cat+ inside a transaction, marks the backend dirty and returns
# +cat+.
def save_category(cat)
  @dbh.transaction do
    _save_category(cat)
  end
  dirty!
  cat
end
399
+
400
# Writes +cat+ and its codes without opening a transaction.
#
# An existing category (one with a dbid) is moved and/or renamed in the
# category tree if needed, its stored codes are deleted and its row is
# updated. A new category is inserted, receives the new row id as its
# dbid and is added to the category tree. In both cases all of the
# category's current codes are then inserted, and the serialised tree
# is written back only when the tree changed.
def _save_category(cat)
  # only resave the tree structure if nec,
  xml_needs_update = false
  # Root categories have no parent. The re-parenting check used to call
  # cat.parent.dbid unconditionally and failed on nil whenever an
  # existing root category was saved.
  parentid = cat.parent ? cat.parent.dbid : nil

  if cat.dbid
    # updating an existing category: check for re-parenting or renaming
    child = cat_tree[cat.dbid]

    if child.parent != parentid
      # NOTE(review): parentid is nil when a category is moved to the
      # top level - confirm CategoryTree#move accepts nil as add does
      cat_tree.move(child.dbid, parentid)
      xml_needs_update = true
    end

    if child.name != cat.name
      child.name = cat.name
      xml_needs_update = true
    end
    @dbh.execute("DELETE FROM code WHERE catid = ?", cat.dbid)
    @dbh.execute("UPDATE category
                  SET catname = ?,
                  catdesc = ?,
                  parent = ?,
                  modified_date = ?
                  WHERE catid = ? ",
                 cat.name,
                 cat.memo,
                 parentid,
                 @dbh.date_freeze( Time.now ),
                 cat.dbid)
  else
    # adding a new category
    @dbh.execute("INSERT INTO category
                  VALUES(NULL, ?, ?, ?, ?, ?)",
                 cat.name, cat.memo, parentid,
                 @dbh.date_freeze( Time.now ),
                 @dbh.date_freeze( Time.now ) )
    cat.dbid = @dbh.last_insert_row_id().to_i
    cat_tree.add(parentid, cat.dbid, cat.name)
    xml_needs_update = true
  end

  stmt_code = @dbh.prepare("INSERT INTO code VALUES(?, ?, ?, ?)")
  begin
    cat.codes.each do | docid, vecs |
      vecs.each do | vec |
        stmt_code.execute( cat.dbid, vec.docid, vec.offset, vec.length )
      end
    end
  ensure
    # release the statement even when an insert fails
    stmt_code.close()
  end

  if xml_needs_update
    @dbh.execute( "UPDATE category_structure SET xml = ? ",
                  cat_tree.serialise())
  end
end
461
+
462
+ # deletes the category +category+. If +recursive+ is false then
463
+ # any children of +category+ will be reattached to the deleted
464
+ # category's parent. If +recursive+ is true (default), then all
465
+ # descendants will be deleted.
466
+ # Returns a list of categories that were actually deleted.
467
# Deletes the category +cat+ and all of its descendants from the
# category tree and the category table, and marks the backend dirty.
#
# Only recursive deletion is implemented; passing a false +recursive+
# raises NotImplementedError before anything is modified.
#
# Returns a list of the items that were deleted: tree nodes for the
# descendants followed by +cat+ itself. Returns nil without doing
# anything when +cat+ has no dbid.
def delete_category(cat, recursive = true)
  return unless cat.dbid
  unless recursive
    raise NotImplementedError,
          'Non-recursive deletion not implemented'
  end

  deleted_items = []
  me = cat_tree[cat.dbid]
  # Walk a copy of the child list: each recursive call removes the
  # child from the tree, and if that also removes it from me.children
  # the live list would shrink mid-iteration and skip siblings.
  # NOTE(review): likely the cause of the former "not all items being
  # returned in list" TODO - confirm against CategoryTree#remove.
  me.children.dup.each do | child |
    deleted_items += delete_category(child, true)
  end
  cat_tree.remove(cat.dbid)
  deleted_items << cat

  @dbh.transaction do
    @dbh.execute("DELETE FROM category WHERE catid = ? ", cat.dbid)
    xml = cat_tree.serialise()
    @dbh.execute("UPDATE category_structure SET xml = ?", xml)
  end
  dirty!
  return deleted_items
end
490
+
491
+ MAGIC_REV_INDEX_ID = -2
492
+ # adds the reverse indexes for +words+ to the existing reverse
493
+ # indexes.
494
# Adds the reverse indexes for +words+ to the existing reverse indexes.
#
# +words+ maps each word to the list of positions at which it occurs in
# document +docid+. Every word is stored as a category whose parent is
# MAGIC_REV_INDEX_ID (created on first use), and each occurrence is
# stored as a code of the word's length at that position. If given,
# +prog_bar+ receives +next+ once per word.
def save_reverse_index(docid, words, prog_bar = nil)
  stmt_wordid = @dbh.prepare("SELECT catid FROM category
                              WHERE catname = ? AND parent = ? ")
  stmt_insert = @dbh.prepare("INSERT INTO category
                              VALUES(NULL, ?, ?, ?, ?, ?)")
  stmt_code = @dbh.prepare("INSERT INTO code VALUES(?, ?, ?, ?)")

  begin
    @dbh.transaction do
      words.each do | word, locations |
        wordid = nil
        stmt_wordid.execute!(word, MAGIC_REV_INDEX_ID) do | r |
          wordid = r[0] # get first value
        end
        unless wordid
          stmt_insert.execute( word, '', MAGIC_REV_INDEX_ID,
                               @dbh.date_freeze( Time.now ),
                               @dbh.date_freeze( Time.now ) )
          wordid = @dbh.last_insert_row_id().to_s
        end

        locations.each do | loc |
          stmt_code.execute(wordid, docid, loc, word.length)
        end
        prog_bar.next() if prog_bar
      end
    end # transaction
  ensure
    # the statements used to stay open when the transaction raised
    [stmt_wordid, stmt_insert, stmt_code].each { | s | s.close() }
  end
end
522
+
523
+
524
+ # Delete all reverse word indexes associated with +docid+
525
# Delete all reverse word indexes associated with +docid+: removes the
# document's codes that belong to categories whose parent is
# MAGIC_REV_INDEX_ID.
def drop_reverse_indexes(docid)
  sql = "DELETE FROM code
         WHERE docid = ? AND catid IN
         (SELECT catid
          FROM category
          WHERE parent = ?) "
  @dbh.transaction do
    remover = @dbh.prepare(sql)
    remover.execute(docid, MAGIC_REV_INDEX_ID)
    remover.close()
  end
end
536
+
537
+
538
+ # should work for latin-1 characters
539
+ WORD_PATTERN = /^[\w\xC0-\xD6\xD8-\xF6\xF8-\xFF][\w\xC0-\xD6\xD8-\xF6\xF8-\xFF']+$/s
540
+
541
+ # returns a hash of document fragments. mods are additional
542
+ # arguments
543
# Searches for +term+ and returns the matching document fragments.
# +mods+ are additional options passed on unchanged. A term matching
# WORD_PATTERN is looked up through the reverse index; any other term
# is found by scanning the document texts.
def get_search_fragments(term, mods = {})
  if term =~ WORD_PATTERN
    get_search_fragments_index(term, mods)
  else
    get_search_fragments_scan(term, mods)
  end
end
550
+
551
# Finds +term+ by scanning the text of every document that contains it
# and returns a FragmentTable with one Fragment per occurrence.
#
# Options in +mods+:
#   :wrap_both      - number of characters of context to include on
#                     either side of the match (default 0)
#   :whole_word     - only match +term+ at word boundaries
#   :case_sensitive - do not ignore case
#
# +term+ is matched literally. It is escaped before being embedded in
# the search expression, so punctuation such as '.', '?' or '(' can
# neither act as a regular expression operator nor raise RegexpError.
# A nil or empty term returns an empty table: it would otherwise yield
# zero-width matches on which the scanner never advances.
def get_search_fragments_scan(term, mods = {})
  vectors = FragmentTable.new()
  return vectors if term.nil? || term.empty?

  wrap = mods[:wrap_both] || 0

  # compile the search regexp once, not once per document
  literal = Regexp.escape(term)
  rx = mods[:whole_word] ?
    '\b\w*.{0,%i}\b%s\b.{0,%i}\w*?\b' % [wrap, literal, wrap] :
    '\b\w*.{0,%i}%s.{0,%i}\w*?\b' % [wrap, literal, wrap]

  search = mods[:case_sensitive] ?
    Regexp.new(rx, Regexp::MULTILINE) :
    Regexp.new(rx, Regexp::MULTILINE|Regexp::IGNORECASE)

  @dbh.execute("SELECT * FROM document WHERE doctext LIKE ?",
               "%#{term}%") do | r |
    doc_title = r['doctitle']
    doc_id = r['docid'].to_i

    scanner = StringScanner.new(r['doctext'])
    while scanner.scan_until(search)
      offset = scanner.pos - scanner.matched_size
      f = Fragment.new(scanner.matched, doc_title, offset, doc_id)
      vectors.add(f)
    end
  end
  vectors
end
577
+
578
+ # returns a hash of document fragments. mods are additional
579
+ # arguments.
580
+ # It is currently case-insensitive and searches for parts of
581
+ # words. SQL needs to be altered below to change this via +mods+
582
# Looks +word+ up in the reverse word index and returns a FragmentTable
# with one Fragment per row of the index query.
#
# By default the search ignores case and also finds words that merely
# start with +word+. Options in +mods+:
#   :wrap_both      - passed four times to the query as the amount of
#                     context to return (default 0)
#   :whole_word     - compare for equality instead of by prefix
#   :case_sensitive - use GLOB on the unmodified name instead of LIKE
#
# The word is handed to the database as a bound parameter and must
# therefore not be SQL-quoted: doubling apostrophes first made the
# database look for a literal '' and words such as "don't" were never
# found.
def get_search_fragments_index(word, mods = {})
  wrap = mods[:wrap_both] || 0

  query = Schema::RINDEX_SEARCH_MODEL_QUERY.dup()

  # a fresh string, so the caller's argument is never modified below
  word = word + "%"
  # if "there" shouldn't be matched when searching with "the"
  if mods[:whole_word]
    query.sub!(/LIKE :search/, "= :search")
    word.sub!(/\%$/, '')
  end

  # SQLite GLOB is case-sensitive, LIKE isn't
  if mods[:case_sensitive]
    query.sub!(/LOWER\(category.catname\)/, 'category.catname')
    query.sub!(/LIKE :search/, "GLOB :search")
    word.sub!(/\%/, '*') # for GLOB
  end

  # something to hold the results
  vectors = FragmentTable.new()

  @dbh.transaction do
    # old-style ? bind params seem to work better with sqlite-2
    params = [ wrap, wrap, wrap, wrap, MAGIC_REV_INDEX_ID, word ]
    @dbh.execute(query, *params) do | r |
      f = Fragment.new( r['snip'], r['doctitle'],
                        r['start_at'].to_i, r['docid'].to_i )
      vectors.add(f)
    end
  end

  vectors
end
616
+
617
+ # destructively reinstalls the schema
618
# Destructively reinstalls the schema. If @dbfile names an existing
# file, the handle is closed, that file is deleted and a new database
# is opened at the same path. The table, trigger and index definitions
# from Schema are then executed in that order within one transaction.
def install_clean()
  if @dbfile and FileTest.exist?(@dbfile)
    @dbh.close()
    File.delete(@dbfile)
    @dbh = Database.new(@dbfile)
  end

  # transaction make a big speed difference here
  @dbh.transaction do
    [ Schema::SCHEMA_TABLES,
      Schema::SCHEMA_TRIGGERS,
      Schema::SCHEMA_INDEXES ].each do | statements |
      @dbh.execute_batch(statements)
    end
  end
end
632
+ end
633
+ end