weft-qda 0.9.6 → 0.9.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/weft.rb +16 -1
- data/lib/weft/WEFT-VERSION-STRING.rb +1 -1
- data/lib/weft/application.rb +17 -74
- data/lib/weft/backend.rb +6 -32
- data/lib/weft/backend/sqlite.rb +222 -164
- data/lib/weft/backend/sqlite/category_tree.rb +52 -48
- data/lib/weft/backend/sqlite/database.rb +57 -0
- data/lib/weft/backend/sqlite/upgradeable.rb +7 -0
- data/lib/weft/broadcaster.rb +90 -0
- data/lib/weft/category.rb +139 -47
- data/lib/weft/codereview.rb +160 -0
- data/lib/weft/coding.rb +74 -23
- data/lib/weft/document.rb +23 -10
- data/lib/weft/exceptions.rb +10 -0
- data/lib/weft/filters.rb +47 -224
- data/lib/weft/filters/indexers.rb +137 -0
- data/lib/weft/filters/input.rb +118 -0
- data/lib/weft/filters/output.rb +101 -0
- data/lib/weft/filters/templates.rb +80 -0
- data/lib/weft/filters/win32backtick.rb +246 -0
- data/lib/weft/query.rb +169 -0
- data/lib/weft/wxgui.rb +349 -294
- data/lib/weft/wxgui/constants.rb +43 -0
- data/lib/weft/wxgui/controls.rb +6 -0
- data/lib/weft/wxgui/controls/category_dropdown.rb +192 -0
- data/lib/weft/wxgui/controls/category_tree.rb +314 -0
- data/lib/weft/wxgui/controls/document_list.rb +97 -0
- data/lib/weft/wxgui/controls/multitype_control.rb +37 -0
- data/lib/weft/wxgui/{inspectors → controls}/textcontrols.rb +235 -64
- data/lib/weft/wxgui/dialogs.rb +144 -41
- data/lib/weft/wxgui/error_handler.rb +116 -36
- data/lib/weft/wxgui/exceptions.rb +7 -0
- data/lib/weft/wxgui/inspectors.rb +61 -208
- data/lib/weft/wxgui/inspectors/category.rb +19 -16
- data/lib/weft/wxgui/inspectors/codereview.rb +90 -132
- data/lib/weft/wxgui/inspectors/document.rb +12 -8
- data/lib/weft/wxgui/inspectors/imagedocument.rb +56 -56
- data/lib/weft/wxgui/inspectors/query.rb +284 -0
- data/lib/weft/wxgui/inspectors/script.rb +147 -23
- data/lib/weft/wxgui/lang/en.rb +69 -0
- data/lib/weft/wxgui/sidebar.rb +90 -432
- data/lib/weft/wxgui/utilities.rb +70 -91
- data/lib/weft/wxgui/workarea.rb +150 -43
- data/share/icons/category.ico +0 -0
- data/share/icons/category.xpm +109 -0
- data/share/icons/codereview.ico +0 -0
- data/share/icons/codereview.xpm +54 -0
- data/share/icons/d_and_c.xpm +126 -0
- data/share/icons/document.ico +0 -0
- data/share/icons/document.xpm +70 -0
- data/share/icons/project.ico +0 -0
- data/share/icons/query.ico +0 -0
- data/share/icons/query.xpm +56 -0
- data/{lib/weft/wxgui → share/icons}/search.xpm +0 -0
- data/share/icons/weft.ico +0 -0
- data/share/icons/weft.xpm +62 -0
- data/share/icons/weft16.ico +0 -0
- data/share/icons/weft32.ico +0 -0
- data/share/templates/category_plain.html +18 -0
- data/share/templates/codereview_plain.html +18 -0
- data/share/templates/document_plain.html +13 -0
- data/share/templates/document_plain.txt +7 -0
- data/test/001-document.rb +55 -36
- data/test/002-category.rb +81 -6
- data/test/003-code.rb +8 -4
- data/test/004-application.rb +13 -34
- data/test/005-query_review.rb +139 -0
- data/test/006-filters.rb +54 -42
- data/test/007-output_filters.rb +113 -0
- data/test/009a-backend_sqlite_basic.rb +95 -24
- data/test/009b-backend_sqlite_complex.rb +43 -62
- data/test/009c_backend_sqlite_bench.rb +5 -10
- data/test/053-doc_inspector.rb +46 -0
- data/test/055-query_window.rb +50 -0
- data/test/all-tests.rb +1 -0
- data/test/test-common.rb +19 -0
- data/test/testdata/empty.qdp +0 -0
- data/test/testdata/simple with space.pdf +0 -0
- data/test/testdata/simple.pdf +0 -0
- data/weft-qda.rb +40 -7
- metadata +74 -14
- data/lib/weft/wxgui/category.xpm +0 -26
- data/lib/weft/wxgui/document.xpm +0 -25
- data/lib/weft/wxgui/inspectors/search.rb +0 -265
- data/lib/weft/wxgui/mondrian.xpm +0 -44
- data/lib/weft/wxgui/weft16.xpm +0 -31
@@ -0,0 +1,160 @@
|
|
1
|
+
module QDA
  # CodeReview is a class that is used for cross-tabulation of coding. It makes
  # it possible to get statistics for the number of characters, passages and
  # documents that are coded by both the row category and the column category
  # of each cell.
  #
  # +rows+ and +cols+ hold the categories making up the table. +contents+ is
  # an array of arrays, indexed [row][column]; each cell holds the result of
  # joining the +codes+ of the two categories concerned. Cells are turned
  # into numbers by sending them +count_method+ (:num_of_docs by default).
  class CodeReview
    attr_accessor :dbid, :count_method
    attr_reader :cols, :rows, :contents

    # A new CodeReview is empty when initialised, and counts by number of
    # documents until +count_method+ is changed.
    def initialize()
      @cols, @rows, @contents = [], [], []
      @count_method = :num_of_docs
    end

    # returns the total number of columns
    def number_cols()
      @cols.length
    end

    # returns the index of the last column (-1 if there are no columns)
    def last_col()
      @cols.length - 1
    end

    # takes a block, yielding each column Category and its index in turn
    def each_col()
      @cols.each_with_index { | col, i | yield col, i }
    end

    # add the Category +category+ as the last column, calculating a new cell
    # for every existing row. Returns the appended category, or nil if
    # +category+ is nil or is already among the columns.
    def add_col(category)
      return nil unless category
      return nil if @cols.include?(category)
      @cols.push(category)

      @rows.each_with_index do | row_cat, i |
        @contents[i][last_col] = intersect(row_cat, category)
      end
      category
    end

    # Updates the column with the changed Category +category+. Useful in a
    # persistent environment where user actions may have altered the coding.
    # Recalculates every cell in that column and returns the column's index,
    # or nil if +category+ is nil or is not among the columns.
    def update_col(category)
      return nil unless category
      idx = @cols.index(category)
      return nil unless idx

      @cols[idx] = category
      @rows.each_with_index do | row_cat, i |
        @contents[i][idx] = intersect(row_cat, category)
      end
      return idx
    end

    # Removes the Category +category+ as a column from the CodeReview. Returns
    # the index of the removed category, if found, or nil, if not.
    def remove_col(category)
      return nil unless category
      idx = @cols.index(category)
      return nil unless idx

      @cols.delete_at(idx)
      @contents.each { | row | row.delete_at(idx) }
      return idx
    end

    # returns the total number of rows in the CodeReview
    def number_rows()
      @rows.length
    end

    # returns the index of the last row in the CodeReview (-1 if there are
    # no rows)
    def last_row()
      @rows.length - 1
    end

    # takes a block, yielding each row Category and its index in turn
    def each_row()
      @rows.each_with_index { | r, i | yield r, i }
    end

    # appends the category +category+ as the last row, calculating a new cell
    # for every existing column. Returns the appended category if it was
    # successfully added, or nil if not - for example, if +category+ is nil
    # or is already among the rows.
    def add_row(category)
      return nil unless category
      return nil if @rows.include?(category)
      @rows.push(category)

      @contents[last_row] = []
      @cols.each_with_index do | col_cat, j |
        @contents[last_row][j] = intersect(col_cat, category)
      end
      category
    end

    # Updates the row with the changed Category +category+, recalculating
    # every cell in that row. Returns the row's index, or nil if +category+
    # is nil or is not among the rows.
    def update_row(category)
      return nil unless category
      idx = @rows.index(category)
      return nil unless idx

      @rows[idx] = category
      @cols.each_with_index do | col_cat, j |
        @contents[idx][j] = intersect(col_cat, category)
      end
      return idx
    end

    # Removes the Category +category+ from the rows of this CodeReview.
    # Returns the index of the corresponding category, if found, or nil, if not.
    def remove_row(category)
      return nil unless category
      idx = @rows.index(category)
      return nil unless idx

      @rows.delete_at(idx)
      @contents.delete_at(idx)
      return idx
    end

    # loops over the contents of this code review row by row, yielding each
    # cell's row index, column index and raw (uncounted) content.
    def each_cell()
      0.upto(last_row) do | i |
        0.upto(last_col) { | j | yield i, j, @contents[i][j] }
      end
    end

    # loops over the contents of this code review, yielding each cell's location
    # and value (calculated by +meth+, defaulting to the code review's current
    # +count_method+). Values are yielded as follows
    #
    #   code_review.each_cell_value { | row_num, col_num, cell_value | ... }
    def each_cell_value(meth = @count_method)
      each_cell { | i, j, cell | yield i, j, cell.send(meth) }
    end

    # returns the maximum value among the codereview contents using the metric
    # +meth+ - which should be a method callable on the cell contents (the
    # original comment names QDA::CodingTable). Returns nil if there are no
    # cells.
    def max(meth = @count_method)
      cell_values(meth).max
    end

    # returns the minimum value among the codereview contents using the metric
    # +meth+ - which should be a method callable on the cell contents (the
    # original comment names QDA::CodingTable). Returns nil if there are no
    # cells.
    def min(meth = @count_method)
      cell_values(meth).min
    end

    # returns the current content as a series of rows; if +with_header+ is
    # true, a header row of column names will be the first row, and each
    # subsequent row will have the name of the row as the first entry.
    def output_rows(with_header = true)
      out_rows = []
      out_rows << [ '', *cols.map { | cat | cat.name } ] if with_header
      each_row do | row, i |
        this_row = contents[i].map { | isect | isect.send(count_method) }
        this_row.unshift(row.name) if with_header
        out_rows.push(this_row)
      end
      out_rows
    end

    # Builds a Query selecting what is coded by both the row category at
    # index +x+ and the column category at index +y+, using +app+ to
    # construct the query functions. Returns nil if either index does not
    # refer to an existing category.
    def to_query(app, x, y)
      return nil unless rows[x] && cols[y]
      query = Query.new( Query::CodedByFunction.new(app, rows[x]) )
      query.add_expression( 'AND', Query::CodedByFunction.new(app, cols[y]) )
      query
    end

    private

    # Calculates a cell: the coding of +base+ joined with that of +other+.
    # The codes are dup'ed first so the category's own coding is never the
    # receiver of +join+.
    def intersect(base, other)
      base.codes.dup.join(other.codes)
    end

    # Collects the value of every cell, as measured by +meth+, into a flat
    # array. Walks the cells explicitly rather than flattening +contents+,
    # so that an array-like cell can never be broken apart.
    def cell_values(meth)
      values = []
      each_cell_value(meth) { | i, j, value | values.push(value) }
      values
    end
  end
end
|
data/lib/weft/coding.rb
CHANGED
@@ -141,6 +141,22 @@ module QDA
|
|
141
141
|
end
|
142
142
|
super(arr)
|
143
143
|
end
|
144
|
+
|
145
|
+
def items
|
146
|
+
self
|
147
|
+
end
|
148
|
+
|
149
|
+
def docid
|
150
|
+
first ? first.docid : nil
|
151
|
+
end
|
152
|
+
|
153
|
+
def title
|
154
|
+
first ? first.title : nil
|
155
|
+
end
|
156
|
+
|
157
|
+
def num_of_chars()
|
158
|
+
inject(0) { | total, code| total += code.length }
|
159
|
+
end
|
144
160
|
|
145
161
|
# iterate over each successive neighbouring pair of codings in
|
146
162
|
# the set, i.e. items 1, 2; items 2,3; items 3, 4 .. items n-1,
|
@@ -243,18 +259,17 @@ module QDA
|
|
243
259
|
self[item.docid].subtract(item)
|
244
260
|
end
|
245
261
|
|
246
|
-
def num_of_docs
|
262
|
+
def num_of_docs()
|
247
263
|
keys.reject { | set | self[set].length == 0 }.length
|
248
264
|
end
|
249
265
|
|
250
|
-
def num_of_codes
|
266
|
+
def num_of_codes()
|
251
267
|
values.inject(0) { | count, codeset | count + codeset.length }
|
252
268
|
end
|
269
|
+
alias :num_of_passages :num_of_codes
|
253
270
|
|
254
|
-
def num_of_chars
|
255
|
-
values.inject(0)
|
256
|
-
codes.inject(total) { | sub_total, code | sub_total + code.length }
|
257
|
-
end
|
271
|
+
def num_of_chars()
|
272
|
+
values.inject(0) { | count, codeset | count += codeset.num_of_chars }
|
258
273
|
end
|
259
274
|
|
260
275
|
# returns true if this coding table contains coding for the
|
@@ -266,40 +281,51 @@ module QDA
|
|
266
281
|
# Adds the coding of the other coding table +other+ to this one,
|
267
282
|
# returning the result as a new table; +self+ is not modified (see merge!)
|
268
283
|
def merge(other)
|
269
|
-
results =
|
284
|
+
results = self.class.new()
|
270
285
|
either = self.keys + other.keys
|
271
286
|
either.uniq.each do | docid |
|
272
287
|
if ! other[docid]
|
273
|
-
results
|
288
|
+
results.set(docid, self[docid])
|
274
289
|
elsif ! self[docid]
|
275
|
-
results
|
290
|
+
results.set(docid, other[docid])
|
276
291
|
else
|
277
|
-
results
|
292
|
+
results.set( docid, self[docid].union(other[docid]) )
|
278
293
|
end
|
279
294
|
end
|
280
|
-
|
295
|
+
return results
|
281
296
|
end
|
282
|
-
|
297
|
+
|
298
|
+
def merge!(other)
|
299
|
+
replace( merge(other) )
|
300
|
+
end
|
301
|
+
|
283
302
|
# Removes all coding from this table that occurs in the other table
|
284
303
|
# +other+, returning the result as a new table (see remove! to modify in place)
|
285
304
|
def remove(other)
|
286
|
-
results =
|
305
|
+
results = self.class.new()
|
287
306
|
each do | docid, codes |
|
288
|
-
results
|
307
|
+
results.set(docid, codes.exclude( other[docid] ) )
|
289
308
|
end
|
290
|
-
|
309
|
+
return results
|
291
310
|
end
|
292
311
|
|
312
|
+
def remove!(other)
|
313
|
+
replace( remove(other) )
|
314
|
+
end
|
293
315
|
# returns a new table holding only the coding that is also covered by +other+
|
294
316
|
def join(other)
|
295
317
|
both = keys.find_all { | doc | other.key?(doc) }
|
296
|
-
results =
|
318
|
+
results = self.class.new()
|
297
319
|
both.each do | docid |
|
298
|
-
results
|
320
|
+
results.set(docid, self[docid].intersect( other[docid] ) )
|
299
321
|
end
|
300
|
-
|
322
|
+
return results
|
301
323
|
end
|
302
324
|
|
325
|
+
# In-place counterpart of +join+: replaces the contents of this table with
# the result of <tt>join(other)</tt>, so that only coding also covered by
# +other+ remains. Takes +other+ as its argument, like merge! and remove!.
# Returns whatever +replace+ returns.
def join!(other)
  replace( join(other) )
end
|
328
|
+
|
303
329
|
def sort(&block)
|
304
330
|
if block_given
|
305
331
|
super(&block)
|
@@ -307,6 +333,14 @@ module QDA
|
|
307
333
|
super { | a, b | a <=> b }
|
308
334
|
end
|
309
335
|
end
|
336
|
+
|
337
|
+
def sets()
|
338
|
+
values_at( *keys.sort )
|
339
|
+
end
|
340
|
+
|
341
|
+
def each_set()
|
342
|
+
keys.sort.each { | docid | yield self[docid] }
|
343
|
+
end
|
310
344
|
end
|
311
345
|
|
312
346
|
# a FragmentTable holds a collection of fragments. It contains a
|
@@ -327,7 +361,14 @@ module QDA
|
|
327
361
|
def [](k)
|
328
362
|
k.kind_of?(String) ? super(@titles[k]) : super(k)
|
329
363
|
end
|
330
|
-
|
364
|
+
|
365
|
+
def set(docid, fragset)
|
366
|
+
super(docid, fragset)
|
367
|
+
if fragset[0] and fragset[0].respond_to?(:doctitle)
|
368
|
+
@titles[fragset[0].doctitle] = fragset[0].docid
|
369
|
+
end
|
370
|
+
end
|
371
|
+
|
331
372
|
# Always use this method to add fragments to the collection
|
332
373
|
def add(fragment)
|
333
374
|
unless fragment.is_a?(Fragment)
|
@@ -337,11 +378,21 @@ module QDA
|
|
337
378
|
@titles[fragment.doctitle] = fragment.docid
|
338
379
|
end
|
339
380
|
|
381
|
+
def titles()
|
382
|
+
@titles.keys.sort
|
383
|
+
end
|
384
|
+
|
340
385
|
def each_title()
|
341
|
-
titles
|
342
|
-
|
343
|
-
|
344
|
-
|
386
|
+
titles.each { | title | yield title, self[ @titles[title] ] }
|
387
|
+
end
|
388
|
+
|
389
|
+
def sets
|
390
|
+
docids = titles.map { | t | @titles[t] }
|
391
|
+
values_at( *docids )
|
392
|
+
end
|
393
|
+
|
394
|
+
def each_set
|
395
|
+
titles.each { | title | yield self[ @titles[title] ] }
|
345
396
|
end
|
346
397
|
|
347
398
|
def to_codingtable()
|
data/lib/weft/document.rb
CHANGED
@@ -27,7 +27,15 @@ class Fragment < String
|
|
27
27
|
# of the document - duplicates role of doctitle - to fix
|
28
28
|
@docid = docid
|
29
29
|
end
|
30
|
-
|
30
|
+
|
31
|
+
def title
|
32
|
+
@doctitle
|
33
|
+
end
|
34
|
+
|
35
|
+
def text
|
36
|
+
self.to_s()
|
37
|
+
end
|
38
|
+
|
31
39
|
def ==(other)
|
32
40
|
super(other) and
|
33
41
|
@offset == other.offset and
|
@@ -61,6 +69,13 @@ class Fragment < String
|
|
61
69
|
@doctitle, abs, @docid )
|
62
70
|
end
|
63
71
|
|
72
|
+
# Scans this fragment's text for +pattern+, as String#scan does, but yields
# each match as a new Fragment carrying this fragment's document title, the
# match's offset (this fragment's +offset+ plus the position of the match
# within it) and the owning document's id.
#
# If no block is given, the matching Fragments are collected and returned
# as an array instead of raising a LocalJumpError.
def scan(pattern)
  unless block_given?
    found = []
    scan(pattern) { | frag | found.push(frag) }
    return found
  end

  # NOTE(review): the original passed @dbid here, but the rest of Fragment
  # stores and passes the document id as @docid. @docid is preferred, with
  # @dbid kept as a fallback in case a subclass only sets that - confirm
  # against Document#dbid=.
  owner_id = @docid || @dbid
  super(pattern) do | m |
    yield Fragment.new(m, @doctitle,
                       offset + Regexp.last_match.begin(0), owner_id )
  end
end
|
78
|
+
|
64
79
|
def inspect()
|
65
80
|
str = length < 50 ? self.to_s : self.to_s[0, 50] << '...'
|
66
81
|
"<*Fragment #{docid} #{offset}-#{self.end} : '#{str}>"
|
@@ -72,18 +87,15 @@ class Document < Fragment
|
|
72
87
|
attr_accessor :title, :memo
|
73
88
|
|
74
89
|
# expects dbid to be set later
|
75
|
-
def initialize(title, text = '', memo = '',
|
76
|
-
|
90
|
+
def initialize( title, text = '', memo = '',
|
91
|
+
create_date = Time.now(),
|
92
|
+
mod_date = Time.now() )
|
77
93
|
super(text, title, 0)
|
78
94
|
@title = title
|
79
95
|
@memo = memo
|
80
|
-
|
96
|
+
|
81
97
|
@create_date = create_date
|
82
|
-
@mod_date = mod_date
|
83
|
-
end
|
84
|
-
|
85
|
-
def text
|
86
|
-
self.to_s
|
98
|
+
@mod_date = mod_date
|
87
99
|
end
|
88
100
|
|
89
101
|
def dbid=(dbid)
|
@@ -98,7 +110,7 @@ class Document < Fragment
|
|
98
110
|
def create()
|
99
111
|
@create_date = Time.now()
|
100
112
|
end
|
101
|
-
|
113
|
+
|
102
114
|
# def append(text, fragtype = 0)
|
103
115
|
# returns the number of characters appended
|
104
116
|
def append(text, term_char = "\n")
|
@@ -114,5 +126,6 @@ class Document < Fragment
|
|
114
126
|
def inspect()
|
115
127
|
"<*Document #{dbid} '#{title}' (#{length} chars)>"
|
116
128
|
end
|
129
|
+
|
117
130
|
end
|
118
131
|
end
|
data/lib/weft/filters.rb
CHANGED
@@ -3,241 +3,64 @@ require 'weft/coding'
|
|
3
3
|
require 'English'
|
4
4
|
|
5
5
|
module QDA
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
def add_indexer(indexer)
|
15
|
-
unless indexer.respond_to?(:feed)
|
16
|
-
raise "Document indexers should have a feed method"
|
17
|
-
end
|
18
|
-
@indexers.push(indexer)
|
19
|
-
end
|
20
|
-
|
21
|
-
# reads +file+ and creates a new document titled +doctitle+. +file+
|
22
|
-
# may be a String filename or an open stream.
|
23
|
-
# Under the hood, calls +read_content+ to extract the content. This
|
24
|
-
# method must be implemented in subclasses. Then +process_content+
|
25
|
-
# is called to create the documents text. This class does something
|
26
|
-
# reasonable with plain text, but structured text formats will want
|
27
|
-
# to subclass this method to process non-text information (for
|
28
|
-
# example, HTML or XML tags)
|
29
|
-
def read(file, doctitle)
|
30
|
-
@content = ''
|
31
|
-
case file
|
32
|
-
when IO
|
33
|
-
@content = file.read()
|
34
|
-
when QDA::Document
|
35
|
-
@content = file.text()
|
36
|
-
when String
|
37
|
-
@content = File.read(file)
|
38
|
-
end
|
39
|
-
process_content(doctitle)
|
40
|
-
end
|
41
|
-
|
42
|
-
def process_content(doctitle)
|
43
|
-
# signal to indexers we're about to start
|
44
|
-
@indexers.each { | indexer | indexer.prepare(@content) }
|
45
|
-
doc = QDA::Document.new(doctitle)
|
46
|
-
@content.each_line do | line |
|
47
|
-
doc.append(line.to_s.chomp)
|
48
|
-
# inform AutoCoders, reverse indexers and so on.
|
49
|
-
@indexers.each { | indexer | indexer.feed(line) }
|
50
|
-
end
|
51
|
-
@indexers.each { | indexer | indexer.terminate() }
|
52
|
-
doc.create
|
53
|
-
return doc
|
54
|
-
end
|
55
|
-
end
|
56
|
-
|
57
|
-
class TextFilter < InputFilter
|
58
|
-
EXTENSIONS = [ 'txt' ]
|
59
|
-
def read_content(file)
|
60
|
-
text = file.read()
|
61
|
-
file.close()
|
62
|
-
text
|
63
|
-
end
|
64
|
-
end
|
65
|
-
|
66
|
-
class PDFFilter < InputFilter
|
67
|
-
EXTENSIONS = [ 'pdf' ]
|
68
|
-
PDF_TO_TEXT_EXEC = 'pdftotext'
|
69
|
-
begin
|
70
|
-
out = `#{PDF_TO_TEXT_EXEC} -v 2>&1`
|
71
|
-
unless out =~ /pdftotext version 3/
|
72
|
-
warn 'PDFtotext Version 3 not found in path' +
|
73
|
-
'PDF Filters will not be avaialabl'
|
6
|
+
module Filters
|
7
|
+
@@import = Hash.new { | h, k | h[k] = [] }
|
8
|
+
@@export = Hash.new { | h, k | h[k] = [] }
|
9
|
+
|
10
|
+
class << self
|
11
|
+
def register_filter( filter_class )
|
12
|
+
if defined? filter_class::IMPORT_CLASS
|
13
|
+
@@import[filter_class::IMPORT_CLASS].push(filter_class)
|
74
14
|
end
|
75
|
-
|
76
|
-
|
77
|
-
NO_COPYING_ERROR_TEXT =
|
78
|
-
"The author or publisher of this PDF document has locked it to
|
79
|
-
prevent copying and extraction of its text. It is not possible to
|
80
|
-
import this document."
|
81
|
-
def read(file, doctitle)
|
82
|
-
case file
|
83
|
-
when IO
|
84
|
-
raise NotImplementedError
|
85
|
-
@content = `#{PDF_TO_TEXT_EXEC} -nopgbrk #{file.path} - 2>&1`
|
86
|
-
file.close()
|
87
|
-
when String
|
88
|
-
@content = `#{PDF_TO_TEXT_EXEC} -nopgbrk #{file} - 2>&1`
|
89
|
-
end
|
90
|
-
|
91
|
-
case $CHILD_STATUS
|
92
|
-
when 0
|
93
|
-
process_content(doctitle)
|
94
|
-
when 3
|
95
|
-
raise RuntimeError.new(NO_COPYING_ERROR_TEXT)
|
96
|
-
else
|
97
|
-
raise RuntimeError.new("Could not extract PDF text: #{text}")
|
15
|
+
if defined? filter_class::EXPORT_CLASS
|
16
|
+
@@export[filter_class::EXPORT_CLASS].push(filter_class)
|
98
17
|
end
|
99
18
|
end
|
100
19
|
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
# ...
|
108
|
-
class HTMLFilter < OutputFilter
|
109
|
-
|
110
|
-
end
|
111
|
-
|
112
|
-
class Indexer
|
113
|
-
attr_reader :cursor
|
114
|
-
def initialize()
|
115
|
-
@cursor = 0
|
20
|
+
# imports an object of class +klass+ e.g. QDA::Document from the file
|
21
|
+
# +filename+, which should be a string.
|
22
|
+
def import_file(klass, filename, opts = {}, &block)
|
23
|
+
ext = filename[-3,3]
|
24
|
+
filter = Filters.find_import_filter(klass, ext).new()
|
25
|
+
import(filter, filename, &block)
|
116
26
|
end
|
117
|
-
|
118
|
-
def
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
def terminate()
|
124
|
-
end
|
125
|
-
|
126
|
-
def prepare(content)
|
127
|
-
end
|
128
|
-
|
129
|
-
def feed(line)
|
130
|
-
@cursor += line.length
|
131
|
-
end
|
132
|
-
end
|
133
|
-
|
134
|
-
# An indexer which records the position of words for later reverse
|
135
|
-
# retrieval
|
136
|
-
class WordIndexer < Indexer
|
137
|
-
attr_reader :words
|
138
|
-
# includes accented latin-1 characters
|
139
|
-
WORD_TOKENIZER = /[\w\xC0-\xD6\xD8-\xF6\xF8-\xFF][\w\xC0-\xD6\xD8-\xF6\xF8-\xFF\']+/
|
140
|
-
def initialize()
|
141
|
-
super
|
142
|
-
@words = Hash.new { | h, k | h[k] = [] }
|
143
|
-
end
|
144
|
-
|
145
|
-
def feed(line)
|
146
|
-
line.scan( WORD_TOKENIZER ) do | word |
|
147
|
-
next if word.length == 1
|
148
|
-
@words[word].push(cursor + Regexp.last_match.begin(0))
|
149
|
-
end
|
150
|
-
super
|
151
|
-
end
|
152
|
-
end
|
153
|
-
|
154
|
-
# An indexer that uses text patterns to identify, for example,
|
155
|
-
# passages by a particular speaker, or text headings.
|
156
|
-
# The indexer can recognise a number of different types of codes,
|
157
|
-
# each denoted by a pattern of punctuation in a line of text. A
|
158
|
-
# default coder recognises the following
|
159
|
-
# A 'Heading', marked by a line **NAME OF HEADING**
|
160
|
-
# A 'Speaker', marked by a line SpeakerName:
|
161
|
-
#
|
162
|
-
# After the filter has run, the results of the coding can be
|
163
|
-
# retrieved by calling Autocoder#codes
|
164
|
-
# This is a hash of codetype names to inner hashes of codevalue names
|
165
|
-
# (strings) to QDA::Codesets corresponding to them.
|
166
|
-
class AutoCoder < Indexer
|
167
|
-
STANDARD_TRIGGER_RULES = {
|
168
|
-
/^(\w+)\:\s*$/ => 'Speaker',
|
169
|
-
/^\*\*(.*)\*\*$/ => 'Heading'
|
170
|
-
}
|
171
|
-
|
172
|
-
attr_reader :codes
|
173
|
-
# +rules+ should be a hash of string keys, naming types of autocode
|
174
|
-
# (e.g. "Speaker", "Heading", "Topic") mapped to values, which
|
175
|
-
# should be regular expressions specifying how the start of such a
|
176
|
-
# code should be recognised.
|
177
|
-
# For example, to find topics marked by the characters '##' at the
|
178
|
-
# start of the line:
|
179
|
-
# 'Heading' => /^##(.*)$/
|
180
|
-
def initialize(rules = STANDARD_TRIGGER_RULES)
|
181
|
-
super()
|
182
|
-
@trigger_rules = rules
|
183
|
-
@codes = Hash.new { | h, k | h[k] = {} }
|
184
|
-
@curr_codes = {}
|
185
|
-
end
|
186
|
-
|
187
|
-
# check a line of document content for triggers
|
188
|
-
def feed(line)
|
189
|
-
@trigger_rules.each do | rule, type |
|
190
|
-
if match = rule.match(line)
|
191
|
-
trigger(cursor, type, match[1])
|
192
|
-
end
|
193
|
-
end
|
194
|
-
super
|
27
|
+
|
28
|
+
def import(filter, content)
|
29
|
+
obj = filter.run(content)
|
30
|
+
yield obj, filter if block_given?
|
31
|
+
obj
|
195
32
|
end
|
196
|
-
|
197
|
-
#
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
new_codeset = get_code(group, codename)
|
202
|
-
@curr_codes[group] = [ new_codeset, cursor ]
|
33
|
+
|
34
|
+
# Returns a hash of all available import filter types, keyed on Weft
|
35
|
+
# classes (eg Document)
|
36
|
+
def import_filters()
|
37
|
+
@@import
|
203
38
|
end
|
204
|
-
|
205
|
-
|
206
|
-
#
|
207
|
-
#
|
208
|
-
def
|
209
|
-
|
210
|
-
@codes[group][codename] = QDA::CodeSet.new()
|
39
|
+
|
40
|
+
|
41
|
+
# Returns a hash of all available export filter types, keyed on Weft
|
42
|
+
# classes (eg Document)
|
43
|
+
def export_filters()
|
44
|
+
@@export
|
211
45
|
end
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
def each_autocode(group)
|
216
|
-
@codes[group].each { | name, codeset | yield name, codeset }
|
46
|
+
|
47
|
+
def find_import_filter( weft_class, ext )
|
48
|
+
@@import[weft_class].find { | filter | filter::EXTENSIONS.include?(ext) }
|
217
49
|
end
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
def apply(docid)
|
222
|
-
@codes.values.each do | group |
|
223
|
-
group.values.each do | codeset |
|
224
|
-
codeset.map! { | x | x.docid = docid; x }
|
225
|
-
end
|
226
|
-
end
|
50
|
+
|
51
|
+
def find_export_filter( weft_class, ext )
|
52
|
+
@@export[weft_class].find { | filter | filter::EXTENSION == ext }
|
227
53
|
end
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
@curr_codes.each_key { | group | store(group) }
|
54
|
+
|
55
|
+
def can_export?(weft_class)
|
56
|
+
@@export.has_key?(weft_class)
|
232
57
|
end
|
233
|
-
|
234
|
-
# finish the coding for the current code being used among +group+
|
235
|
-
def store(group)
|
236
|
-
codeset, start = @curr_codes[group]
|
237
|
-
# -1 here is a placeholder
|
238
|
-
terminus = cursor - start
|
239
|
-
codeset.add( Code.new(-1, start, terminus) )
|
58
|
+
|
240
59
|
end
|
241
|
-
private :store
|
242
60
|
end
|
61
|
+
|
62
|
+
require 'weft/filters/indexers'
|
63
|
+
require 'weft/filters/output'
|
64
|
+
require 'weft/filters/input'
|
65
|
+
require 'weft/filters/templates'
|
243
66
|
end
|