weft-qda 0.9.6 → 0.9.8
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/weft.rb +16 -1
- data/lib/weft/WEFT-VERSION-STRING.rb +1 -1
- data/lib/weft/application.rb +17 -74
- data/lib/weft/backend.rb +6 -32
- data/lib/weft/backend/sqlite.rb +222 -164
- data/lib/weft/backend/sqlite/category_tree.rb +52 -48
- data/lib/weft/backend/sqlite/database.rb +57 -0
- data/lib/weft/backend/sqlite/upgradeable.rb +7 -0
- data/lib/weft/broadcaster.rb +90 -0
- data/lib/weft/category.rb +139 -47
- data/lib/weft/codereview.rb +160 -0
- data/lib/weft/coding.rb +74 -23
- data/lib/weft/document.rb +23 -10
- data/lib/weft/exceptions.rb +10 -0
- data/lib/weft/filters.rb +47 -224
- data/lib/weft/filters/indexers.rb +137 -0
- data/lib/weft/filters/input.rb +118 -0
- data/lib/weft/filters/output.rb +101 -0
- data/lib/weft/filters/templates.rb +80 -0
- data/lib/weft/filters/win32backtick.rb +246 -0
- data/lib/weft/query.rb +169 -0
- data/lib/weft/wxgui.rb +349 -294
- data/lib/weft/wxgui/constants.rb +43 -0
- data/lib/weft/wxgui/controls.rb +6 -0
- data/lib/weft/wxgui/controls/category_dropdown.rb +192 -0
- data/lib/weft/wxgui/controls/category_tree.rb +314 -0
- data/lib/weft/wxgui/controls/document_list.rb +97 -0
- data/lib/weft/wxgui/controls/multitype_control.rb +37 -0
- data/lib/weft/wxgui/{inspectors → controls}/textcontrols.rb +235 -64
- data/lib/weft/wxgui/dialogs.rb +144 -41
- data/lib/weft/wxgui/error_handler.rb +116 -36
- data/lib/weft/wxgui/exceptions.rb +7 -0
- data/lib/weft/wxgui/inspectors.rb +61 -208
- data/lib/weft/wxgui/inspectors/category.rb +19 -16
- data/lib/weft/wxgui/inspectors/codereview.rb +90 -132
- data/lib/weft/wxgui/inspectors/document.rb +12 -8
- data/lib/weft/wxgui/inspectors/imagedocument.rb +56 -56
- data/lib/weft/wxgui/inspectors/query.rb +284 -0
- data/lib/weft/wxgui/inspectors/script.rb +147 -23
- data/lib/weft/wxgui/lang/en.rb +69 -0
- data/lib/weft/wxgui/sidebar.rb +90 -432
- data/lib/weft/wxgui/utilities.rb +70 -91
- data/lib/weft/wxgui/workarea.rb +150 -43
- data/share/icons/category.ico +0 -0
- data/share/icons/category.xpm +109 -0
- data/share/icons/codereview.ico +0 -0
- data/share/icons/codereview.xpm +54 -0
- data/share/icons/d_and_c.xpm +126 -0
- data/share/icons/document.ico +0 -0
- data/share/icons/document.xpm +70 -0
- data/share/icons/project.ico +0 -0
- data/share/icons/query.ico +0 -0
- data/share/icons/query.xpm +56 -0
- data/{lib/weft/wxgui → share/icons}/search.xpm +0 -0
- data/share/icons/weft.ico +0 -0
- data/share/icons/weft.xpm +62 -0
- data/share/icons/weft16.ico +0 -0
- data/share/icons/weft32.ico +0 -0
- data/share/templates/category_plain.html +18 -0
- data/share/templates/codereview_plain.html +18 -0
- data/share/templates/document_plain.html +13 -0
- data/share/templates/document_plain.txt +7 -0
- data/test/001-document.rb +55 -36
- data/test/002-category.rb +81 -6
- data/test/003-code.rb +8 -4
- data/test/004-application.rb +13 -34
- data/test/005-query_review.rb +139 -0
- data/test/006-filters.rb +54 -42
- data/test/007-output_filters.rb +113 -0
- data/test/009a-backend_sqlite_basic.rb +95 -24
- data/test/009b-backend_sqlite_complex.rb +43 -62
- data/test/009c_backend_sqlite_bench.rb +5 -10
- data/test/053-doc_inspector.rb +46 -0
- data/test/055-query_window.rb +50 -0
- data/test/all-tests.rb +1 -0
- data/test/test-common.rb +19 -0
- data/test/testdata/empty.qdp +0 -0
- data/test/testdata/simple with space.pdf +0 -0
- data/test/testdata/simple.pdf +0 -0
- data/weft-qda.rb +40 -7
- metadata +74 -14
- data/lib/weft/wxgui/category.xpm +0 -26
- data/lib/weft/wxgui/document.xpm +0 -25
- data/lib/weft/wxgui/inspectors/search.rb +0 -265
- data/lib/weft/wxgui/mondrian.xpm +0 -44
- data/lib/weft/wxgui/weft16.xpm +0 -31
@@ -0,0 +1,160 @@
|
|
1
|
+
module QDA
  # CodeReview is a class that is used for cross-tabulation of coding. It makes
  # it possible to get statistics for the number of characters, passages and
  # documents that are coded by both the row category and the column category
  # of each cell.
  #
  # Row and column entries must respond to +name+ and +codes+; the object
  # returned by +codes+ must respond to +dup+ and +join+. Each cell holds the
  # result of joining a row category's codes with a column category's codes,
  # and is reduced to a number by sending it the current +count_method+.
  class CodeReview
    attr_accessor :dbid, :count_method
    attr_reader :cols, :rows, :contents

    # A new CodeReview is empty when initialised; cells are counted by
    # +num_of_docs+ until +count_method+ is changed.
    def initialize()
      @cols, @rows, @contents = [], [], []
      @count_method = :num_of_docs
    end

    # returns the total number of columns
    def number_cols()
      @cols.length
    end

    # returns the index of the last column (-1 if there are no columns)
    def last_col()
      @cols.length - 1
    end

    # takes a block, yielding each column Category and its index in turn
    def each_col()
      @cols.each_with_index { | col, i | yield col, i }
    end

    # add the Category +category+ as the last column, filling in the new
    # column's cell for every existing row. Returns the appended category, or
    # nil if +category+ is nil or is already a column.
    def add_col(category)
      return nil unless category
      return nil if @cols.include?(category)
      @cols.push(category)

      @rows.each_with_index do | row_cat, i |
        @contents[i][last_col] = row_cat.codes.dup.join(category.codes)
      end
      # each_with_index returns its receiver, so return the category
      # explicitly to mirror the documented contract of add_row
      category
    end

    # Updates the column with the changed Category +category+. Useful in a
    # persistent environment where user actions may have altered the coding.
    # Returns the index of the updated column, or nil if +category+ is nil or
    # is not currently a column.
    def update_col(category)
      return nil unless category
      # look the category up among the columns (not the rows)
      return nil unless idx = @cols.index(category)

      @cols[idx] = category
      # recalculate this column's cell in every row
      @rows.each_with_index do | row_cat, i |
        @contents[i][idx] = row_cat.codes.dup.join(category.codes)
      end
      return idx
    end

    # Removes the Category +category+ as a column from the CodeReview. Returns
    # the index of the removed category, if found, or nil, if not.
    def remove_col(category)
      return nil unless category
      return nil unless idx = @cols.index(category)
      @cols.delete_at(idx)
      @contents.each { | row | row.delete_at(idx) }
      return idx
    end

    # returns the total number of rows in the CodeReview
    def number_rows()
      @rows.length
    end

    # returns the index of the last row in the CodeReview (-1 if there are no
    # rows)
    def last_row()
      @rows.length - 1
    end

    # takes a block, yielding each row Category and its index in turn
    def each_row()
      @rows.each_with_index { | r, i | yield r, i }
    end

    # appends the category +category+ as the last row. Returns the appended
    # category if it was successfully added, or nil if not - for example, if
    # +category+ is nil or is already one of the rows.
    def add_row(category)
      return nil unless category
      return nil if @rows.include?(category)
      @rows.push(category)
      @contents[last_row] = []
      @cols.each_with_index do | col_cat, j |
        @contents[last_row][j] = col_cat.codes.dup.join(category.codes)
      end
      # each_with_index returns its receiver (@cols); return the category as
      # documented
      category
    end

    # Updates the row with the changed Category +category+, recalculating
    # every cell in that row. Returns the index of the updated row, or nil if
    # +category+ is nil or is not currently a row.
    def update_row(category)
      return nil unless category
      return nil unless idx = @rows.index(category)
      @rows[idx] = category
      @cols.each_with_index do | col_cat, j |
        @contents[idx][j] = col_cat.codes.dup.join(category.codes)
      end
      return idx
    end

    # Removes the Category +category+ from the rows of this CodeReview.
    # Returns the index of the corresponding category, if found, or nil, if not.
    def remove_row(category)
      return nil unless category
      return nil unless idx = @rows.index(category)
      @rows.delete_at(idx)
      @contents.delete_at(idx)
      return idx
    end

    # loops over the contents of this code review row by row, yielding each
    # cell's row index, column index and raw (uncounted) cell contents.
    def each_cell()
      0.upto(last_row) do | i |
        0.upto(last_col) { | j | yield i, j, @contents[i][j] }
      end
    end

    # loops over the contents of this code review, yielding each cell's location
    # and value (calculated by +meth+, defaulting to the code review's current
    # +count_method+). Values are yielded as follows
    #
    #   code_review.each_cell_value { | row_num, col_num, cell_value | ... }
    def each_cell_value(meth = @count_method)
      each_cell { | i, j, cell | yield i, j, cell.send(meth) }
    end

    # returns the maximum value among the codereview contents using the metric
    # +meth+ - which should be a method called upon QDA::CodingTable. Returns
    # nil if there are no cells.
    def max(meth = @count_method)
      @contents.flatten.collect { | x | x.send(meth) }.max
    end

    # returns the minimum value among the codereview contents using the metric
    # +meth+ - which should be a method called upon QDA::CodingTable. Returns
    # nil if there are no cells.
    def min(meth = @count_method)
      @contents.flatten.collect { | x | x.send(meth) }.min
    end

    # returns the current content as a series of rows; if +with_header+ is true,
    # a header row of column names will be the first row, and each subsequent
    # row will have the name of the row as the first entry.
    def output_rows(with_header = true)
      out_rows = []
      out_rows << [ '', *cols.map { | cat | cat.name } ] if with_header
      each_row do | row, i |
        this_row = contents[i].map { | isect | isect.send(count_method) }
        this_row.unshift(row.name) if with_header
        out_rows.push(this_row)
      end
      out_rows
    end

    # Builds a Query combining, with 'AND', a CodedByFunction for the row
    # category at index +x+ and one for the column category at index +y+.
    # Returns nil if either index has no category.
    def to_query(app, x, y)
      return nil unless rows[x] && cols[y]
      query = Query.new( Query::CodedByFunction.new(app, rows[x]) )
      query.add_expression( 'AND', Query::CodedByFunction.new(app, cols[y]) )
      query
    end
  end
end
|
data/lib/weft/coding.rb
CHANGED
@@ -141,6 +141,22 @@ module QDA
|
|
141
141
|
end
|
142
142
|
super(arr)
|
143
143
|
end
|
144
|
+
|
145
|
+
def items
|
146
|
+
self
|
147
|
+
end
|
148
|
+
|
149
|
+
def docid
|
150
|
+
first ? first.docid : nil
|
151
|
+
end
|
152
|
+
|
153
|
+
def title
|
154
|
+
first ? first.title : nil
|
155
|
+
end
|
156
|
+
|
157
|
+
def num_of_chars()
|
158
|
+
inject(0) { | total, code| total += code.length }
|
159
|
+
end
|
144
160
|
|
145
161
|
# iterate over each successive neighbouring pair of codings in
|
146
162
|
# the set, i.e. items 1, 2; items 2,3; items 3, 4 .. items n-1,
|
@@ -243,18 +259,17 @@ module QDA
|
|
243
259
|
self[item.docid].subtract(item)
|
244
260
|
end
|
245
261
|
|
246
|
-
def num_of_docs
|
262
|
+
def num_of_docs()
|
247
263
|
keys.reject { | set | self[set].length == 0 }.length
|
248
264
|
end
|
249
265
|
|
250
|
-
def num_of_codes
|
266
|
+
def num_of_codes()
|
251
267
|
values.inject(0) { | count, codeset | count + codeset.length }
|
252
268
|
end
|
269
|
+
alias :num_of_passages :num_of_codes
|
253
270
|
|
254
|
-
def num_of_chars
|
255
|
-
values.inject(0)
|
256
|
-
codes.inject(total) { | sub_total, code | sub_total + code.length }
|
257
|
-
end
|
271
|
+
def num_of_chars()
|
272
|
+
values.inject(0) { | count, codeset | count += codeset.num_of_chars }
|
258
273
|
end
|
259
274
|
|
260
275
|
# returns true if this coding table contains coding for the
|
@@ -266,40 +281,51 @@ module QDA
|
|
266
281
|
# Adds the coding of the other coding table +other+ to this one,
|
267
282
|
# returning the result as a new table and leaving +self+ unmodified
|
268
283
|
def merge(other)
|
269
|
-
results =
|
284
|
+
results = self.class.new()
|
270
285
|
either = self.keys + other.keys
|
271
286
|
either.uniq.each do | docid |
|
272
287
|
if ! other[docid]
|
273
|
-
results
|
288
|
+
results.set(docid, self[docid])
|
274
289
|
elsif ! self[docid]
|
275
|
-
results
|
290
|
+
results.set(docid, other[docid])
|
276
291
|
else
|
277
|
-
results
|
292
|
+
results.set( docid, self[docid].union(other[docid]) )
|
278
293
|
end
|
279
294
|
end
|
280
|
-
|
295
|
+
return results
|
281
296
|
end
|
282
|
-
|
297
|
+
|
298
|
+
def merge!(other)
|
299
|
+
replace( merge(other) )
|
300
|
+
end
|
301
|
+
|
283
302
|
# Removes all coding from this table that occurs in the other table
|
284
303
|
# +other+, returning the result as a new table (remove! works in place)
|
285
304
|
def remove(other)
|
286
|
-
results =
|
305
|
+
results = self.class.new()
|
287
306
|
each do | docid, codes |
|
288
|
-
results
|
307
|
+
results.set(docid, codes.exclude( other[docid] ) )
|
289
308
|
end
|
290
|
-
|
309
|
+
return results
|
291
310
|
end
|
292
311
|
|
312
|
+
def remove!(other)
|
313
|
+
replace( remove(other) )
|
314
|
+
end
|
293
315
|
# returns a new table holding only the coding that is also covered by +other+
|
294
316
|
def join(other)
|
295
317
|
both = keys.find_all { | doc | other.key?(doc) }
|
296
|
-
results =
|
318
|
+
results = self.class.new()
|
297
319
|
both.each do | docid |
|
298
|
-
results
|
320
|
+
results.set(docid, self[docid].intersect( other[docid] ) )
|
299
321
|
end
|
300
|
-
|
322
|
+
return results
|
301
323
|
end
|
302
324
|
|
325
|
+
# In-place form of join: replaces the contents of this table with the
# result of join(other). Takes +other+ as a parameter, like merge! and
# remove!.
def join!(other)
  replace( join(other) )
end
|
328
|
+
|
303
329
|
def sort(&block)
|
304
330
|
if block_given
|
305
331
|
super(&block)
|
@@ -307,6 +333,14 @@ module QDA
|
|
307
333
|
super { | a, b | a <=> b }
|
308
334
|
end
|
309
335
|
end
|
336
|
+
|
337
|
+
def sets()
|
338
|
+
values_at( *keys.sort )
|
339
|
+
end
|
340
|
+
|
341
|
+
def each_set()
|
342
|
+
keys.sort.each { | docid | yield self[docid] }
|
343
|
+
end
|
310
344
|
end
|
311
345
|
|
312
346
|
# a FragmentTable holds a collection of fragments. It contains a
|
@@ -327,7 +361,14 @@ module QDA
|
|
327
361
|
def [](k)
|
328
362
|
k.kind_of?(String) ? super(@titles[k]) : super(k)
|
329
363
|
end
|
330
|
-
|
364
|
+
|
365
|
+
def set(docid, fragset)
|
366
|
+
super(docid, fragset)
|
367
|
+
if fragset[0] and fragset[0].respond_to?(:doctitle)
|
368
|
+
@titles[fragset[0].doctitle] = fragset[0].docid
|
369
|
+
end
|
370
|
+
end
|
371
|
+
|
331
372
|
# Always use this method to add fragments to the collection
|
332
373
|
def add(fragment)
|
333
374
|
unless fragment.is_a?(Fragment)
|
@@ -337,11 +378,21 @@ module QDA
|
|
337
378
|
@titles[fragment.doctitle] = fragment.docid
|
338
379
|
end
|
339
380
|
|
381
|
+
def titles()
|
382
|
+
@titles.keys.sort
|
383
|
+
end
|
384
|
+
|
340
385
|
def each_title()
|
341
|
-
titles
|
342
|
-
|
343
|
-
|
344
|
-
|
386
|
+
titles.each { | title | yield title, self[ @titles[title] ] }
|
387
|
+
end
|
388
|
+
|
389
|
+
def sets
|
390
|
+
docids = titles.map { | t | @titles[t] }
|
391
|
+
values_at( *docids )
|
392
|
+
end
|
393
|
+
|
394
|
+
def each_set
|
395
|
+
titles.each { | title | yield self[ @titles[title] ] }
|
345
396
|
end
|
346
397
|
|
347
398
|
def to_codingtable()
|
data/lib/weft/document.rb
CHANGED
@@ -27,7 +27,15 @@ class Fragment < String
|
|
27
27
|
# of the document - duplicates role of doctitle - to fix
|
28
28
|
@docid = docid
|
29
29
|
end
|
30
|
-
|
30
|
+
|
31
|
+
def title
|
32
|
+
@doctitle
|
33
|
+
end
|
34
|
+
|
35
|
+
def text
|
36
|
+
self.to_s()
|
37
|
+
end
|
38
|
+
|
31
39
|
def ==(other)
|
32
40
|
super(other) and
|
33
41
|
@offset == other.offset and
|
@@ -61,6 +69,13 @@ class Fragment < String
|
|
61
69
|
@doctitle, abs, @docid )
|
62
70
|
end
|
63
71
|
|
72
|
+
# Scans this fragment for +pattern+, yielding each match to the caller's
# block as a new Fragment. The new Fragment's offset is this fragment's
# offset plus the position at which the match begins.
def scan(pattern)
  super do | m |
    # Fragment#initialize stores the document id in @docid, so pass that on.
    # NOTE(review): falls back to @dbid in case a subclass only sets that
    # variable - confirm against Document#dbid=
    yield Fragment.new(m, @doctitle,
                       offset + Regexp.last_match.begin(0), @docid || @dbid )
  end
end
|
78
|
+
|
64
79
|
def inspect()
|
65
80
|
str = length < 50 ? self.to_s : self.to_s[0, 50] << '...'
|
66
81
|
"<*Fragment #{docid} #{offset}-#{self.end} : '#{str}>"
|
@@ -72,18 +87,15 @@ class Document < Fragment
|
|
72
87
|
attr_accessor :title, :memo
|
73
88
|
|
74
89
|
# expects dbid to be set later
|
75
|
-
def initialize(title, text = '', memo = '',
|
76
|
-
|
90
|
+
def initialize( title, text = '', memo = '',
|
91
|
+
create_date = Time.now(),
|
92
|
+
mod_date = Time.now() )
|
77
93
|
super(text, title, 0)
|
78
94
|
@title = title
|
79
95
|
@memo = memo
|
80
|
-
|
96
|
+
|
81
97
|
@create_date = create_date
|
82
|
-
@mod_date = mod_date
|
83
|
-
end
|
84
|
-
|
85
|
-
def text
|
86
|
-
self.to_s
|
98
|
+
@mod_date = mod_date
|
87
99
|
end
|
88
100
|
|
89
101
|
def dbid=(dbid)
|
@@ -98,7 +110,7 @@ class Document < Fragment
|
|
98
110
|
def create()
|
99
111
|
@create_date = Time.now()
|
100
112
|
end
|
101
|
-
|
113
|
+
|
102
114
|
# def append(text, fragtype = 0)
|
103
115
|
# returns the number of characters appended
|
104
116
|
def append(text, term_char = "\n")
|
@@ -114,5 +126,6 @@ class Document < Fragment
|
|
114
126
|
def inspect()
|
115
127
|
"<*Document #{dbid} '#{title}' (#{length} chars)>"
|
116
128
|
end
|
129
|
+
|
117
130
|
end
|
118
131
|
end
|
data/lib/weft/filters.rb
CHANGED
@@ -3,241 +3,64 @@ require 'weft/coding'
|
|
3
3
|
require 'English'
|
4
4
|
|
5
5
|
module QDA
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
def add_indexer(indexer)
|
15
|
-
unless indexer.respond_to?(:feed)
|
16
|
-
raise "Document indexers should have a feed method"
|
17
|
-
end
|
18
|
-
@indexers.push(indexer)
|
19
|
-
end
|
20
|
-
|
21
|
-
# reads +file+ and creates a new document titled +doctitle+. +file+
|
22
|
-
# may be a String filename or an open stream.
|
23
|
-
# Under the hood, calls +read_content+ to extract the content. This
|
24
|
-
# method must be implemented in subclasses. Then +process_content+
|
25
|
-
# is called to create the documents text. This class does something
|
26
|
-
# reasonable with plain text, but structured text formats will want
|
27
|
-
# to subclass this method to process non-text information (for
|
28
|
-
# example, HTML or XML tags)
|
29
|
-
def read(file, doctitle)
|
30
|
-
@content = ''
|
31
|
-
case file
|
32
|
-
when IO
|
33
|
-
@content = file.read()
|
34
|
-
when QDA::Document
|
35
|
-
@content = file.text()
|
36
|
-
when String
|
37
|
-
@content = File.read(file)
|
38
|
-
end
|
39
|
-
process_content(doctitle)
|
40
|
-
end
|
41
|
-
|
42
|
-
def process_content(doctitle)
|
43
|
-
# signal to indexers we're about to start
|
44
|
-
@indexers.each { | indexer | indexer.prepare(@content) }
|
45
|
-
doc = QDA::Document.new(doctitle)
|
46
|
-
@content.each_line do | line |
|
47
|
-
doc.append(line.to_s.chomp)
|
48
|
-
# inform AutoCoders, reverse indexers and so on.
|
49
|
-
@indexers.each { | indexer | indexer.feed(line) }
|
50
|
-
end
|
51
|
-
@indexers.each { | indexer | indexer.terminate() }
|
52
|
-
doc.create
|
53
|
-
return doc
|
54
|
-
end
|
55
|
-
end
|
56
|
-
|
57
|
-
class TextFilter < InputFilter
|
58
|
-
EXTENSIONS = [ 'txt' ]
|
59
|
-
def read_content(file)
|
60
|
-
text = file.read()
|
61
|
-
file.close()
|
62
|
-
text
|
63
|
-
end
|
64
|
-
end
|
65
|
-
|
66
|
-
class PDFFilter < InputFilter
|
67
|
-
EXTENSIONS = [ 'pdf' ]
|
68
|
-
PDF_TO_TEXT_EXEC = 'pdftotext'
|
69
|
-
begin
|
70
|
-
out = `#{PDF_TO_TEXT_EXEC} -v 2>&1`
|
71
|
-
unless out =~ /pdftotext version 3/
|
72
|
-
warn 'PDFtotext Version 3 not found in path' +
|
73
|
-
'PDF Filters will not be avaialabl'
|
6
|
+
module Filters
|
7
|
+
@@import = Hash.new { | h, k | h[k] = [] }
|
8
|
+
@@export = Hash.new { | h, k | h[k] = [] }
|
9
|
+
|
10
|
+
class << self
|
11
|
+
def register_filter( filter_class )
|
12
|
+
if defined? filter_class::IMPORT_CLASS
|
13
|
+
@@import[filter_class::IMPORT_CLASS].push(filter_class)
|
74
14
|
end
|
75
|
-
|
76
|
-
|
77
|
-
NO_COPYING_ERROR_TEXT =
|
78
|
-
"The author or publisher of this PDF document has locked it to
|
79
|
-
prevent copying and extraction of its text. It is not possible to
|
80
|
-
import this document."
|
81
|
-
def read(file, doctitle)
|
82
|
-
case file
|
83
|
-
when IO
|
84
|
-
raise NotImplementedError
|
85
|
-
@content = `#{PDF_TO_TEXT_EXEC} -nopgbrk #{file.path} - 2>&1`
|
86
|
-
file.close()
|
87
|
-
when String
|
88
|
-
@content = `#{PDF_TO_TEXT_EXEC} -nopgbrk #{file} - 2>&1`
|
89
|
-
end
|
90
|
-
|
91
|
-
case $CHILD_STATUS
|
92
|
-
when 0
|
93
|
-
process_content(doctitle)
|
94
|
-
when 3
|
95
|
-
raise RuntimeError.new(NO_COPYING_ERROR_TEXT)
|
96
|
-
else
|
97
|
-
raise RuntimeError.new("Could not extract PDF text: #{text}")
|
15
|
+
if defined? filter_class::EXPORT_CLASS
|
16
|
+
@@export[filter_class::EXPORT_CLASS].push(filter_class)
|
98
17
|
end
|
99
18
|
end
|
100
19
|
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
# ...
|
108
|
-
class HTMLFilter < OutputFilter
|
109
|
-
|
110
|
-
end
|
111
|
-
|
112
|
-
class Indexer
|
113
|
-
attr_reader :cursor
|
114
|
-
def initialize()
|
115
|
-
@cursor = 0
|
20
|
+
# imports an object of class +klass+ e.g. QDA::Document from the file
|
21
|
+
# +filename+, which should be a string.
|
22
|
+
def import_file(klass, filename, opts = {}, &block)
|
23
|
+
ext = filename[-3,3]
|
24
|
+
filter = Filters.find_import_filter(klass, ext).new()
|
25
|
+
import(filter, filename, &block)
|
116
26
|
end
|
117
|
-
|
118
|
-
def
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
def terminate()
|
124
|
-
end
|
125
|
-
|
126
|
-
def prepare(content)
|
127
|
-
end
|
128
|
-
|
129
|
-
def feed(line)
|
130
|
-
@cursor += line.length
|
131
|
-
end
|
132
|
-
end
|
133
|
-
|
134
|
-
# An indexer which records the position of words for later reverse
|
135
|
-
# retrieval
|
136
|
-
class WordIndexer < Indexer
|
137
|
-
attr_reader :words
|
138
|
-
# includes accented latin-1 characters
|
139
|
-
WORD_TOKENIZER = /[\w\xC0-\xD6\xD8-\xF6\xF8-\xFF][\w\xC0-\xD6\xD8-\xF6\xF8-\xFF\']+/
|
140
|
-
def initialize()
|
141
|
-
super
|
142
|
-
@words = Hash.new { | h, k | h[k] = [] }
|
143
|
-
end
|
144
|
-
|
145
|
-
def feed(line)
|
146
|
-
line.scan( WORD_TOKENIZER ) do | word |
|
147
|
-
next if word.length == 1
|
148
|
-
@words[word].push(cursor + Regexp.last_match.begin(0))
|
149
|
-
end
|
150
|
-
super
|
151
|
-
end
|
152
|
-
end
|
153
|
-
|
154
|
-
# An indexer that uses text patterns to identify, for example,
|
155
|
-
# passages by a particular speaker, or text headings.
|
156
|
-
# The indexer can recognise a number of different types of codes,
|
157
|
-
# each denoted by a pattern of punctuation in a line of text. A
|
158
|
-
# default coder recognises the following
|
159
|
-
# A 'Heading', marked by a line **NAME OF HEADING**
|
160
|
-
# A 'Speaker', marked by a line SpeakerName:
|
161
|
-
#
|
162
|
-
# After the filter has run, the results of the coding can be
|
163
|
-
# retrieved by calling Autocoder#codes
|
164
|
-
# This is a hash of codetype names to inner hashes of codevalue names
|
165
|
-
# (strings) to QDA::Codesets corresponding to them.
|
166
|
-
class AutoCoder < Indexer
|
167
|
-
STANDARD_TRIGGER_RULES = {
|
168
|
-
/^(\w+)\:\s*$/ => 'Speaker',
|
169
|
-
/^\*\*(.*)\*\*$/ => 'Heading'
|
170
|
-
}
|
171
|
-
|
172
|
-
attr_reader :codes
|
173
|
-
# +rules+ should be a hash of string keys, naming types of autocode
|
174
|
-
# (e.g. "Speaker", "Heading", "Topic") mapped to values, which
|
175
|
-
# should be regular expressions specifying how the start of such a
|
176
|
-
# code should be recognised.
|
177
|
-
# For example, to find topics marked by the characters '##' at the
|
178
|
-
# start of the line:
|
179
|
-
# 'Heading' => /^##(.*)$/
|
180
|
-
def initialize(rules = STANDARD_TRIGGER_RULES)
|
181
|
-
super()
|
182
|
-
@trigger_rules = rules
|
183
|
-
@codes = Hash.new { | h, k | h[k] = {} }
|
184
|
-
@curr_codes = {}
|
185
|
-
end
|
186
|
-
|
187
|
-
# check a line of document content for triggers
|
188
|
-
def feed(line)
|
189
|
-
@trigger_rules.each do | rule, type |
|
190
|
-
if match = rule.match(line)
|
191
|
-
trigger(cursor, type, match[1])
|
192
|
-
end
|
193
|
-
end
|
194
|
-
super
|
27
|
+
|
28
|
+
def import(filter, content)
|
29
|
+
obj = filter.run(content)
|
30
|
+
yield obj, filter if block_given?
|
31
|
+
obj
|
195
32
|
end
|
196
|
-
|
197
|
-
#
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
new_codeset = get_code(group, codename)
|
202
|
-
@curr_codes[group] = [ new_codeset, cursor ]
|
33
|
+
|
34
|
+
# Returns a hash of all available import filter types, keyed on Weft
|
35
|
+
# classes (eg Document)
|
36
|
+
def import_filters()
|
37
|
+
@@import
|
203
38
|
end
|
204
|
-
|
205
|
-
|
206
|
-
#
|
207
|
-
#
|
208
|
-
def
|
209
|
-
|
210
|
-
@codes[group][codename] = QDA::CodeSet.new()
|
39
|
+
|
40
|
+
|
41
|
+
# Returns a hash of all available export filter types, keyed on Weft
|
42
|
+
# classes (eg Document)
|
43
|
+
def export_filters()
|
44
|
+
@@export
|
211
45
|
end
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
def each_autocode(group)
|
216
|
-
@codes[group].each { | name, codeset | yield name, codeset }
|
46
|
+
|
47
|
+
def find_import_filter( weft_class, ext )
|
48
|
+
@@import[weft_class].find { | filter | filter::EXTENSIONS.include?(ext) }
|
217
49
|
end
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
def apply(docid)
|
222
|
-
@codes.values.each do | group |
|
223
|
-
group.values.each do | codeset |
|
224
|
-
codeset.map! { | x | x.docid = docid; x }
|
225
|
-
end
|
226
|
-
end
|
50
|
+
|
51
|
+
def find_export_filter( weft_class, ext )
|
52
|
+
@@export[weft_class].find { | filter | filter::EXTENSION == ext }
|
227
53
|
end
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
@curr_codes.each_key { | group | store(group) }
|
54
|
+
|
55
|
+
def can_export?(weft_class)
|
56
|
+
@@export.has_key?(weft_class)
|
232
57
|
end
|
233
|
-
|
234
|
-
# finish the coding for the current code being used among +group+
|
235
|
-
def store(group)
|
236
|
-
codeset, start = @curr_codes[group]
|
237
|
-
# -1 here is a placeholder
|
238
|
-
terminus = cursor - start
|
239
|
-
codeset.add( Code.new(-1, start, terminus) )
|
58
|
+
|
240
59
|
end
|
241
|
-
private :store
|
242
60
|
end
|
61
|
+
|
62
|
+
require 'weft/filters/indexers'
|
63
|
+
require 'weft/filters/output'
|
64
|
+
require 'weft/filters/input'
|
65
|
+
require 'weft/filters/templates'
|
243
66
|
end
|