RubyGems - weft-qda - Versions diffs - 0.9.6 → 0.9.8 - Mend

weft-qda 0.9.6 → 0.9.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (86) hide show

data/lib/weft.rb +16 -1
data/lib/weft/WEFT-VERSION-STRING.rb +1 -1
data/lib/weft/application.rb +17 -74
data/lib/weft/backend.rb +6 -32
data/lib/weft/backend/sqlite.rb +222 -164
data/lib/weft/backend/sqlite/category_tree.rb +52 -48
data/lib/weft/backend/sqlite/database.rb +57 -0
data/lib/weft/backend/sqlite/upgradeable.rb +7 -0
data/lib/weft/broadcaster.rb +90 -0
data/lib/weft/category.rb +139 -47
data/lib/weft/codereview.rb +160 -0
data/lib/weft/coding.rb +74 -23
data/lib/weft/document.rb +23 -10
data/lib/weft/exceptions.rb +10 -0
data/lib/weft/filters.rb +47 -224
data/lib/weft/filters/indexers.rb +137 -0
data/lib/weft/filters/input.rb +118 -0
data/lib/weft/filters/output.rb +101 -0
data/lib/weft/filters/templates.rb +80 -0
data/lib/weft/filters/win32backtick.rb +246 -0
data/lib/weft/query.rb +169 -0
data/lib/weft/wxgui.rb +349 -294
data/lib/weft/wxgui/constants.rb +43 -0
data/lib/weft/wxgui/controls.rb +6 -0
data/lib/weft/wxgui/controls/category_dropdown.rb +192 -0
data/lib/weft/wxgui/controls/category_tree.rb +314 -0
data/lib/weft/wxgui/controls/document_list.rb +97 -0
data/lib/weft/wxgui/controls/multitype_control.rb +37 -0
data/lib/weft/wxgui/{inspectors → controls}/textcontrols.rb +235 -64
data/lib/weft/wxgui/dialogs.rb +144 -41
data/lib/weft/wxgui/error_handler.rb +116 -36
data/lib/weft/wxgui/exceptions.rb +7 -0
data/lib/weft/wxgui/inspectors.rb +61 -208
data/lib/weft/wxgui/inspectors/category.rb +19 -16
data/lib/weft/wxgui/inspectors/codereview.rb +90 -132
data/lib/weft/wxgui/inspectors/document.rb +12 -8
data/lib/weft/wxgui/inspectors/imagedocument.rb +56 -56
data/lib/weft/wxgui/inspectors/query.rb +284 -0
data/lib/weft/wxgui/inspectors/script.rb +147 -23
data/lib/weft/wxgui/lang/en.rb +69 -0
data/lib/weft/wxgui/sidebar.rb +90 -432
data/lib/weft/wxgui/utilities.rb +70 -91
data/lib/weft/wxgui/workarea.rb +150 -43
data/share/icons/category.ico +0 -0
data/share/icons/category.xpm +109 -0
data/share/icons/codereview.ico +0 -0
data/share/icons/codereview.xpm +54 -0
data/share/icons/d_and_c.xpm +126 -0
data/share/icons/document.ico +0 -0
data/share/icons/document.xpm +70 -0
data/share/icons/project.ico +0 -0
data/share/icons/query.ico +0 -0
data/share/icons/query.xpm +56 -0
data/{lib/weft/wxgui → share/icons}/search.xpm +0 -0
data/share/icons/weft.ico +0 -0
data/share/icons/weft.xpm +62 -0
data/share/icons/weft16.ico +0 -0
data/share/icons/weft32.ico +0 -0
data/share/templates/category_plain.html +18 -0
data/share/templates/codereview_plain.html +18 -0
data/share/templates/document_plain.html +13 -0
data/share/templates/document_plain.txt +7 -0
data/test/001-document.rb +55 -36
data/test/002-category.rb +81 -6
data/test/003-code.rb +8 -4
data/test/004-application.rb +13 -34
data/test/005-query_review.rb +139 -0
data/test/006-filters.rb +54 -42
data/test/007-output_filters.rb +113 -0
data/test/009a-backend_sqlite_basic.rb +95 -24
data/test/009b-backend_sqlite_complex.rb +43 -62
data/test/009c_backend_sqlite_bench.rb +5 -10
data/test/053-doc_inspector.rb +46 -0
data/test/055-query_window.rb +50 -0
data/test/all-tests.rb +1 -0
data/test/test-common.rb +19 -0
data/test/testdata/empty.qdp +0 -0
data/test/testdata/simple with space.pdf +0 -0
data/test/testdata/simple.pdf +0 -0
data/weft-qda.rb +40 -7
metadata +74 -14
data/lib/weft/wxgui/category.xpm +0 -26
data/lib/weft/wxgui/document.xpm +0 -25
data/lib/weft/wxgui/inspectors/search.rb +0 -265
data/lib/weft/wxgui/mondrian.xpm +0 -44
data/lib/weft/wxgui/weft16.xpm +0 -31

data/lib/weft/codereview.rb ADDED

@@ -0,0 +1,160 @@
+module QDA
+  # CodeReview is a class that is used for cross-tabulation of coding. It makes
+  # it possible to get statistics for the number of characters, passages and
+  # documents that are coded by both the row category and the column category
+  # of each cell.
+  #
+  # +rows+ and +cols+ are lists of categories. +contents+ is an array with one
+  # entry per row; each entry is an array with one cell per column, and each
+  # cell holds the result of joining the codes of the row category with the
+  # codes of the column category.
+  class CodeReview
+    attr_accessor :dbid, :count_method
+    attr_reader :cols, :rows, :contents
+    # A new CodeReview is empty when initialised; cells are counted by
+    # +num_of_docs+ until +count_method+ is changed.
+    def initialize()
+      @cols, @rows, @contents = [], [], []
+      @count_method = :num_of_docs
+    end
+    # returns the total number of columns
+    def number_cols()
+      @cols.length
+    end
+    # returns the index of the last column (-1 if there are no columns)
+    def last_col()
+      @cols.length - 1
+    end
+    # takes a block, yielding each column Category and its index in turn
+    def each_col()
+      @cols.each_with_index { | col, i | yield col, i }
+    end
+    # add the Category +category+ as the last column, filling in one new cell
+    # for every existing row. Returns the added category, or nil if
+    # +category+ is nil or is already a column.
+    def add_col(category)
+      return nil unless category
+      return nil if @cols.include?(category)
+      @cols.push(category)
+      @rows.each_with_index do | row_cat, i |
+        @contents[i][last_col] = row_cat.codes.dup.join(category.codes)
+      end
+      category
+    end
+    # Updates the column with the changed Category +category+. Useful in a
+    # persistent environment where user actions may have altered the coding.
+    # Recalculates every cell in that column and returns the column's index,
+    # or nil if +category+ is nil or is not a column of this CodeReview.
+    def update_col(category)
+      return nil unless category
+      # look the category up among the columns, not the rows
+      return nil unless idx = @cols.index(category)
+      @cols[idx] = category
+      @rows.each_with_index do | row_cat, i |
+        @contents[i][idx] = row_cat.codes.dup.join(category.codes)
+      end
+      return idx
+    end
+    # Removes the Category +category+ as a column from the CodeReview. Returns
+    # the index of the removed category, if found, or nil, if not.
+    def remove_col(category)
+      return nil unless category
+      return nil unless idx = @cols.index(category)
+      @cols.delete_at(idx)
+      @contents.each { | row | row.delete_at(idx) }
+      return idx
+    end
+    # returns the total number of rows in the CodeReview
+    def number_rows()
+      @rows.length
+    end
+    # returns the index of the last row in the CodeReview (-1 if there are no
+    # rows)
+    def last_row()
+      @rows.length - 1
+    end
+    # takes a block, yielding each row Category and its index in turn
+    def each_row()
+      @rows.each_with_index { | r, i | yield r, i }
+    end
+    # appends the category +category+ as the last row, filling in one cell for
+    # every existing column. Returns the appended category if it was
+    # successfully added, or nil if not - for example, if +category+ is nil
+    # or is already a row.
+    def add_row(category)
+      return nil unless category
+      return nil if @rows.include?(category)
+      @rows.push(category)
+      @contents[last_row] = []
+      @cols.each_with_index do | col_cat, j |
+        @contents[last_row][j] = col_cat.codes.dup.join(category.codes)
+      end
+      category
+    end
+    # Updates the row with the changed Category +category+, recalculating
+    # every cell in that row. Returns the row's index, or nil if +category+
+    # is nil or is not a row of this CodeReview.
+    def update_row(category)
+      return nil unless category
+      return nil unless idx = @rows.index(category)
+      @rows[idx] = category
+      @cols.each_with_index do | col_cat, j |
+        @contents[idx][j] = col_cat.codes.dup.join(category.codes)
+      end
+      return idx
+    end
+    # Removes the Category +category+ from the rows of this CodeReview.
+    # Returns the index of the corresponding category, if found, or nil, if not.
+    def remove_row(category)
+      return nil unless category
+      return nil unless idx = @rows.index(category)
+      @rows.delete_at(idx)
+      @contents.delete_at(idx)
+      return idx
+    end
+    # loops over the contents of this code review row by row, yielding each
+    # cell's row index, column index and stored content.
+    def each_cell()
+      0.upto(last_row) do | i |
+        0.upto(last_col) { | j | yield i, j, @contents[i][j] }
+      end
+    end
+    # loops over the contents of this code review, yielding each cell's location
+    # and value (calculated by +meth+, defaulting to the code review's current
+    # +count_method+). Values are yielded as follows
+    #
+    #  code_review.each_cell_value { | row_num, col_num, cell_value | ... }
+    def each_cell_value(meth = @count_method)
+      each_cell { | i, j, cell | yield i, j, cell.send(meth) }
+    end
+    # returns the maximum value among the codereview contents using the metric
+    # +meth+ - which should be a method called upon QDA::CodingTable. Returns
+    # nil if there are no cells.
+    def max(meth = @count_method)
+      @contents.flatten.collect { | x | x.send(meth) }.max
+    end
+    # returns the minimum value among the codereview contents using the metric
+    # +meth+ - which should be a method called upon QDA::CodingTable. Returns
+    # nil if there are no cells.
+    def min(meth = @count_method)
+      @contents.flatten.collect { | x | x.send(meth) }.min
+    end
+    # returns the current content as a series of rows; if +with_header+ is
+    # true, a header row of column names will be the first row, and each
+    # subsequent row will have the name of the row as the first entry. Cell
+    # values are calculated using the current +count_method+.
+    def output_rows(with_header = true)
+      out_rows = []
+      out_rows << [ '', *cols.map { | cat | cat.name } ] if with_header
+      each_row do | row, i |
+        this_row = contents[i].map { | isect | isect.send(count_method) }
+        this_row.unshift(row.name) if with_header
+        out_rows.push(this_row)
+      end
+      out_rows
+    end
+    # builds a Query for the cell at row index +x+ and column index +y+,
+    # combining a CodedByFunction for the row category with one for the column
+    # category using 'AND'. +app+ is passed through to each CodedByFunction.
+    # Returns nil if there is no row at +x+ or no column at +y+.
+    def to_query(app, x, y)
+      return nil unless rows[x] and cols[y]
+      query = Query.new( Query::CodedByFunction.new(app, rows[x]) )
+      query.add_expression( 'AND', Query::CodedByFunction.new(app, cols[y]) )
+      query
+    end
+  end
+end

data/lib/weft/coding.rb CHANGED

@@ -141,6 +141,22 @@ module QDA
       end
       super(arr)
     end
+    # returns this collection itself
+    def items
+      self
+    end
+    # returns the docid of the first item in this collection, or nil if
+    # +first+ returns nothing
+    def docid
+      first ? first.docid : nil
+    end
+    # returns the title of the first item in this collection, or nil if
+    # +first+ returns nothing
+    def title
+      first ? first.title : nil
+    end
+    # returns the sum of the +length+ of every item in this collection
+    def num_of_chars()
+      inject(0) { | total, code| total += code.length }
+    end
     # iterate over each successive neighbouring pair of codings in
     # the set, i.e. items 1, 2; items 2,3; items 3, 4 ..  items n-1,
@@ -243,18 +259,17 @@ module QDA
       self[item.docid].subtract(item)
     end
-    def num_of_docs
+    def num_of_docs()
       keys.reject { | set | self[set].length == 0 }.length
     end
-    def num_of_codes
+    def num_of_codes()
       values.inject(0) { | count, codeset | count + codeset.length }
     end
+    alias :num_of_passages :num_of_codes
-    def num_of_chars
-      values.inject(0) do | total, codes |
-        codes.inject(total) { | sub_total, code | sub_total + code.length }
-      end
+    def num_of_chars()
+      values.inject(0) { | count, codeset | count += codeset.num_of_chars }
     end
     # returns true if this coding table contains coding for the
@@ -266,40 +281,51 @@ module QDA
     # Returns a new coding table holding the coding of this table combined
     # with that of +other+. Use merge! to modify +self+ in place.
     def merge(other)
-      results = CodingTable.new()
+      results = self.class.new()
       either = self.keys + other.keys
       either.uniq.each do | docid |
         if ! other[docid]
-          results[docid] = self[docid]
+          results.set(docid, self[docid])
         elsif ! self[docid]
-          results[docid] = other[docid]
+          results.set(docid, other[docid])
         else
-          results[docid] = self[docid].union(other[docid])
+          results.set( docid, self[docid].union(other[docid]) )
         end
       end
-      replace(results)
+      return results
     end
+    # In-place form of merge: replaces the contents of this table with the
+    # result of merge(other)
+    def merge!(other)
+      replace( merge(other) )
+    end
     # Returns a new coding table holding the coding of this table, minus any
     # that occurs in +other+. Use remove! to modify this CodingTable in place.
     def remove(other)
-      results = CodingTable.new()
+      results = self.class.new()
       each do | docid, codes |
-        results[docid] = self[docid].exclude(other[docid])
+        results.set(docid, codes.exclude( other[docid] ) )
       end
-      replace(results)
+      return results
     end
+    # In-place form of remove: replaces the contents of this table with the
+    # result of remove(other)
+    def remove!(other)
+      replace( remove(other) )
+    end
     # returns a new coding table holding only the coding that is also covered by +other+
     def join(other)
       both = keys.find_all { | doc | other.key?(doc) }
-      results = CodingTable.new()
+      results = self.class.new()
       both.each do | docid |
-        results[docid] = self[docid].intersect( other[docid] )
+        results.set(docid, self[docid].intersect( other[docid] ) )
       end
-      replace(results)
+      return results
     end
+    # In-place form of join: replaces the contents of this table with the
+    # result of join(other), i.e. only the coding also covered by +other+
+    def join!(other)
+      replace( join(other) )
+    end
     def sort(&block)
       if block_given
         super(&block)
@@ -307,6 +333,14 @@ module QDA
         super { | a, b | a <=> b }
       end
     end
+    # returns the values of this table as an array, ordered by sorted key
+    def sets()
+      values_at( *keys.sort )
+    end
+    # takes a block, yielding each value of this table in turn, in order of
+    # sorted key
+    def each_set()
+      keys.sort.each { | docid | yield self[docid] }
+    end
   end
   # a FragmentTable holds a collection of fragments. It contains a
@@ -327,7 +361,14 @@ module QDA
     def [](k)
       k.kind_of?(String) ? super(@titles[k]) : super(k)
     end
+    # stores +fragset+ under +docid+ via the superclass +set+. If the first
+    # element of +fragset+ responds to +doctitle+, also records that title
+    # against the element's docid so the set can be looked up by title.
+    def set(docid, fragset)
+      super(docid, fragset)
+      if fragset[0] and fragset[0].respond_to?(:doctitle)
+        @titles[fragset[0].doctitle] = fragset[0].docid
+      end
+    end
     # Always use this method to add fragments to the collection
     def add(fragment)
       unless fragment.is_a?(Fragment)
@@ -337,11 +378,21 @@ module QDA
       @titles[fragment.doctitle] = fragment.docid
     end
+    # returns the document titles known to this table, sorted
+    def titles()
+      @titles.keys.sort
+    end
     def each_title()
-      titles = @titles.keys.sort
-      titles.each do | title |
-        yield title, self[ @titles[title] ]
-      end
+      titles.each { | title | yield title, self[ @titles[title] ] }
+    end
+    # returns the values of this table as an array, ordered by sorted
+    # document title
+    def sets
+      docids = titles.map { | t | @titles[t] }
+      values_at( *docids )
+    end
+    # takes a block, yielding the entry for each document in turn, in order
+    # of sorted document title
+    def each_set
+      titles.each { | title | yield self[ @titles[title] ] }
     end
     def to_codingtable()

data/lib/weft/document.rb CHANGED

@@ -27,7 +27,15 @@ class Fragment < String
     # of the document - duplicates role of doctitle - to fix
     @docid    = docid
   end
+  # returns the title of the document this fragment belongs to (the value
+  # held in @doctitle)
+  def title
+    @doctitle
+  end
+  # returns the result of calling to_s on this fragment
+  def text
+    self.to_s()
+  end
   def ==(other)
     super(other) and
       @offset == other.offset and
@@ -61,6 +69,13 @@ class Fragment < String
                   @doctitle, abs, @docid )
   end
+  # Scans this fragment for +pattern+, yielding each match to the block as a
+  # new Fragment. The new fragment's offset is this fragment's offset plus
+  # the position at which the match begins, and it carries this fragment's
+  # document title and document id. A block must be given.
+  def scan(pattern)
+    super do | m |
+      # pass @docid, the id this class stores, so matches keep their document
+      yield Fragment.new(m, @doctitle,
+                         offset + Regexp.last_match.begin(0), @docid )
+    end
+  end
   def inspect()
     str = length < 50 ? self.to_s : self.to_s[0, 50] << '...'
     "<*Fragment #{docid} #{offset}-#{self.end} : '#{str}>"
@@ -72,18 +87,15 @@ class Document < Fragment
   attr_accessor :title, :memo
   # expects dbid to be set later
-  def initialize(title, text = '', memo = '',
-                 create_date = nil, mod_date = nil)
+  def initialize( title, text = '', memo = '',
+                  create_date = Time.now(),
+                  mod_date = Time.now() )
     super(text, title, 0)
 	@title = title
     @memo  = memo
     @create_date = create_date
-    @mod_date    = mod_date
-  end
-  def text
-    self.to_s
+    @mod_date    = mod_date
   end
   def dbid=(dbid)
@@ -98,7 +110,7 @@ class Document < Fragment
   def create()
     @create_date = Time.now()
   end
   #   def append(text, fragtype = 0)
   # returns the number of characters appended
   def append(text, term_char = "\n")
@@ -114,5 +126,6 @@ class Document < Fragment
   def inspect()
     "<*Document #{dbid} '#{title}' (#{length} chars)>"
   end
 end
 end

data/lib/weft/exceptions.rb ADDED

@@ -0,0 +1,10 @@
+module QDA
+  # Error classes for the QDA namespace.
+  # NOTE(review): the intended use of each class below is inferred from its
+  # name only - confirm against the code that raises it.
+  #
+  # An ArgumentError; presumably raised when a name that must be unique is
+  # already in use.
+  class NotUniqueNameError < ArgumentError
+  end
+  # An ArgumentError; presumably raised when a supplied name is not
+  # acceptable.
+  class BadNameError < ArgumentError
+  end
+  # An ArgumentError; presumably raised when a requested change would
+  # produce an invalid structure.
+  class BadStructureError < ArgumentError
+  end
+  # A StandardError; presumably raised when a requested item cannot be found.
+  class NotFoundError < StandardError
+  end
+end

data/lib/weft/filters.rb CHANGED

@@ -3,241 +3,64 @@ require 'weft/coding'
 require 'English'
 module QDA
-  class InputFilter
-    attr_reader :cursor
-	def initialize()
-      @cursor = 0
-      @indexers = []
-	end
-    def add_indexer(indexer)
-      unless indexer.respond_to?(:feed)
-        raise "Document indexers should have a feed method"
-      end
-      @indexers.push(indexer)
-    end
-    # reads +file+ and creates a new document titled +doctitle+. +file+
-    # may be a String filename or an open stream.
-    # Under the hood, calls +read_content+ to extract the content. This
-    # method must be implemented in subclasses. Then +process_content+
-    # is called to create the documents text. This class does something
-    # reasonable with plain text, but structured text formats will want
-    # to subclass this method to process non-text information (for
-    # example, HTML or XML tags)
-	def read(file, doctitle)
-      @content = ''
-	  case file
-	  when IO
-		@content = file.read()
-      when QDA::Document
-        @content = file.text()
-	  when String
-        @content = File.read(file)
-	  end
-      process_content(doctitle)
-    end
-    def process_content(doctitle)
-      # signal to indexers we're about to start
-      @indexers.each { | indexer | indexer.prepare(@content) }
-	  doc = QDA::Document.new(doctitle)
-	  @content.each_line do | line |
-        doc.append(line.to_s.chomp)
-        # inform AutoCoders, reverse indexers and so on.
-        @indexers.each { | indexer | indexer.feed(line) }
-	  end
-      @indexers.each { | indexer | indexer.terminate() }
-	  doc.create
-      return doc
-    end
-  end
-  class TextFilter < InputFilter
-    EXTENSIONS = [ 'txt' ]
-	def read_content(file)
-      text = file.read()
-      file.close()
-      text
-	end
-  end
-  class PDFFilter < InputFilter
-    EXTENSIONS = [ 'pdf' ]
-    PDF_TO_TEXT_EXEC = 'pdftotext'
-    begin
-      out = `#{PDF_TO_TEXT_EXEC} -v 2>&1`
-      unless out =~ /pdftotext version 3/
-        warn 'PDFtotext Version 3 not found in path' +
-             'PDF Filters will not be avaialabl'
+  module Filters
+    @@import = Hash.new { |  h, k | h[k] = [] }
+    @@export = Hash.new { |  h, k | h[k] = [] }
+    class << self
+    def register_filter( filter_class )
+      if defined? filter_class::IMPORT_CLASS
+        @@import[filter_class::IMPORT_CLASS].push(filter_class)
       end
-    end
-    NO_COPYING_ERROR_TEXT =
-      "The author or publisher of this PDF document has locked it to
-prevent copying and extraction of its text. It is not possible to
-import this document."
-    def read(file, doctitle)
-	  case file
-	  when IO
-        raise NotImplementedError
-		@content = `#{PDF_TO_TEXT_EXEC} -nopgbrk #{file.path} - 2>&1`
-        file.close()
-	  when String
-        @content = `#{PDF_TO_TEXT_EXEC} -nopgbrk #{file} - 2>&1`
-	  end
-      case $CHILD_STATUS
-      when 0
-        process_content(doctitle)
-      when 3
-        raise RuntimeError.new(NO_COPYING_ERROR_TEXT)
-      else
-        raise RuntimeError.new("Could not extract PDF text: #{text}")
+      if defined? filter_class::EXPORT_CLASS
+        @@export[filter_class::EXPORT_CLASS].push(filter_class)
       end
     end
-  end
-  class OutputFilter
-  end
-  # ...
-  class HTMLFilter < OutputFilter
-  end
-  class Indexer
-    attr_reader :cursor
-    def initialize()
-      @cursor = 0
+    # imports an object of class +klass+ e.g. QDA::Document from the file
+    # +filename+, which should be a string.
+    def import_file(klass, filename, opts = {}, &block)
+      ext = filename[-3,3]
+      filter = Filters.find_import_filter(klass, ext).new()
+      import(filter, filename, &block)
     end
-    def index(str)
-      prepare(str)
-      str.each_line { | line | feed(line) }
-    end
-    def terminate()
-    end
-    def prepare(content)
-    end
-    def feed(line)
-      @cursor += line.length
-    end
-  end
-  # An indexer which records the position of words for later reverse
-  # retrieval
-  class WordIndexer <  Indexer
-    attr_reader :words
-    # includes accented latin-1 characters
-    WORD_TOKENIZER = /[\w\xC0-\xD6\xD8-\xF6\xF8-\xFF][\w\xC0-\xD6\xD8-\xF6\xF8-\xFF\']+/
-    def initialize()
-      super
-      @words = Hash.new { | h, k | h[k] = [] }
-    end
-    def feed(line)
-      line.scan( WORD_TOKENIZER ) do | word |
-        next if word.length == 1
-        @words[word].push(cursor + Regexp.last_match.begin(0))
-      end
-      super
-    end
-  end
-  # An indexer that uses text patterns to identify, for example,
-  # passages by a particular speaker, or text headings.
-  # The indexer can recognise a number of different types of codes,
-  # each denoted by a pattern of punctuation in a line of text. A
-  # default coder recognises the following
-  # A 'Heading', marked by a line **NAME OF HEADING**
-  # A 'Speaker', marked by a line SpeakerName:
-  #
-  # After the filter has run, the results of the coding can be
-  # retrieved by calling Autocoder#codes
-  # This is a hash of codetype names to inner hashes of codevalue names
-  # (strings) to QDA::Codesets corresponding to them.
-  class AutoCoder < Indexer
-    STANDARD_TRIGGER_RULES = {
-      /^(\w+)\:\s*$/   => 'Speaker',
-      /^\*\*(.*)\*\*$/ => 'Heading'
-    }
-    attr_reader :codes
-    # +rules+ should be a hash of string keys, naming types of autocode
-    # (e.g. "Speaker", "Heading", "Topic") mapped to values, which
-    # should be regular expressions specifying how the start of such a
-    # code should be recognised.
-    # For example, to find topics marked by the characters '##' at the
-    # start of the line:
-    # 'Heading' => /^##(.*)$/
-    def initialize(rules = STANDARD_TRIGGER_RULES)
-      super()
-      @trigger_rules = rules
-      @codes      = Hash.new { | h, k | h[k] = {} }
-      @curr_codes = {}
-    end
-    # check a line of document content for triggers
-    def feed(line)
-      @trigger_rules.each do | rule, type |
-        if match = rule.match(line)
-          trigger(cursor, type, match[1])
-        end
-      end
-      super
+    def import(filter, content)
+      obj = filter.run(content)
+      yield obj, filter if block_given?
+      obj
     end
-    # take action on finding a autocode marker
-    def trigger(cursor, group, codename)
-      # save any previous code that was being done for this group
-      store(group) if @curr_codes[group]
-      new_codeset = get_code(group, codename)
-      @curr_codes[group] = [ new_codeset, cursor ]
+    # Returns a hash of all available import filter types, keyed on Weft
+    # classes (eg Document)
+    def import_filters()
+      @@import
     end
-    private :trigger
-    # returns the code name +codename+ within the group +group+,
-    # creating a new empty category
-    def get_code(group, codename)
-      return @codes[group][codename] if @codes[group][codename]
-      @codes[group][codename] = QDA::CodeSet.new()
+    # Returns a hash of all available export filter types, keyed on Weft
+    # classes (eg Document)
+    def export_filters()
+      @@export
     end
-    # Returns the names and codesets for autocodes in group +group+
-    # in a series of pairs
-    def each_autocode(group)
-      @codes[group].each { | name, codeset | yield name, codeset }
+    def find_import_filter( weft_class, ext )
+      @@import[weft_class].find { | filter | filter::EXTENSIONS.include?(ext) }
     end
-    # alters all the stored coding in this autocoder so that it refers
-    # to the document identified by +docid+
-    def apply(docid)
-      @codes.values.each do | group  |
-        group.values.each do | codeset |
-          codeset.map! { | x | x.docid = docid; x }
-        end
-      end
+    def find_export_filter( weft_class, ext )
+      @@export[weft_class].find { | filter | filter::EXTENSION == ext }
     end
-    # finish up all currently active coding in this autocoder
-    def terminate()
-      @curr_codes.each_key { | group | store(group) }
+    def can_export?(weft_class)
+      @@export.has_key?(weft_class)
     end
-    # finish the coding for the current code being used among +group+
-    def store(group)
-      codeset, start = @curr_codes[group]
-      # -1 here is a placeholder
-      terminus = cursor - start
-      codeset.add( Code.new(-1, start, terminus) )
     end
-    private :store
   end
+  require 'weft/filters/indexers'
+  require 'weft/filters/output'
+  require 'weft/filters/input'
+  require 'weft/filters/templates'
 end