RubyGems - lumix - Versions diffs - 0.0.2-java - Mend

lumix 0.0.2-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35) hide show

data/COPYING +18 -0
data/bin/lumix +4 -0
data/bin/lumix-gui +4 -0
data/lib/lumix/base.rb +56 -0
data/lib/lumix/charset.rb +35 -0
data/lib/lumix/cli.rb +96 -0
data/lib/lumix/concordancer.rb +254 -0
data/lib/lumix/corrections.rb +84 -0
data/lib/lumix/fast_search.rb +91 -0
data/lib/lumix/filter.rb +89 -0
data/lib/lumix/gui.rb +148 -0
data/lib/lumix/lookup.rb +105 -0
data/lib/lumix/lookup_filter.rb +43 -0
data/lib/lumix/lookup_search.rb +95 -0
data/lib/lumix/main.rb +7 -0
data/lib/lumix/model/base_models.rb +35 -0
data/lib/lumix/model/maglev_models.rb +42 -0
data/lib/lumix/model/mock_models.rb +46 -0
data/lib/lumix/model/sequel_models.rb +53 -0
data/lib/lumix/proto/lookup.rb +105 -0
data/lib/lumix/proto/lookup_filter.rb +40 -0
data/lib/lumix/proto/lookup_search.rb +81 -0
data/lib/lumix/result_view.rb +93 -0
data/lib/lumix/schema/001_create_tables.rb +35 -0
data/lib/lumix/schema/002_categories.rb +28 -0
data/lib/lumix/schema/003_add_fulltagged.rb +15 -0
data/lib/lumix/schema/004_create_lookup_tables.rb +44 -0
data/lib/lumix/slow_search.rb +104 -0
data/lib/lumix/text_snippet.rb +29 -0
data/lib/lumix/textprocessing.rb +108 -0
data/lib/lumix/thread_pool.rb +127 -0
data/spec/filter_spec.rb +55 -0
data/spec/lookup_spec.rb +70 -0
data/spec/text_snippet_spec.rb +55 -0
metadata +175 -0

data/lib/lumix/filter.rb ADDED Viewed

@@ -0,0 +1,89 @@
+module Lumix
+  class Filter
+    HANDLERS = %w[handle_wildcard handle_choice handle_literals
+              handle_dangling_tags handle_multiplicators ensure_wordbounds]
+    attr_reader :results, :filter
+    def initialize(suffix, filter, &result_proc)
+      @suffix = suffix.gsub(/\\\|/, '[\|]') # workaround to make handle_dangling_tags play nicely
+      @filter = filter
+      @result_proc = result_proc
+      @re = to_re(filter)
+      @results = 0
+    end
+    def <<(result)
+      @results += 1
+      @result_proc[*result] if @result_proc
+    end
+    def scan(text, &block)
+      results = []
+      return results unless text
+      (' ' + text + ' ').scan(@re) do |s|
+        t_begin = $~.begin(0) - 1
+        t_end = $~.end(0) - 1
+        s = block ? block[s, t_begin, t_end, $~] : s
+        results << s
+      end
+      results
+    end
+    def to_re(filter)
+      re = HANDLERS.inject(filter) do |filter, handler|
+        puts filter
+        puts "#{handler} -->"
+        send handler, filter
+      end
+      puts re
+      Regexp.new(re)
+    end
+    # character wildcard replacement
+    def handle_wildcard(re)
+      re.gsub(/([^\)])\*/, '\1[^\s\|]*')
+    end
+    # Takes (!A B C) and transforms it
+    def handle_choice(re)
+      re.gsub(/\(\!([^\)]+)\)/) do
+        c = $1.split.map{ |t| '(?!' + t + ')' }.join
+        '(?:' + c + '[^\s\|]*' + @suffix + ')'
+      end
+    end
+    # transforms literals delimited by ""
+    def handle_literals(re)
+      re.gsub(/\"([^\"]*)\"(?:\|(\S+?))?/) do
+        str = $1
+        tag = $2 || '[^\s\|]+'
+        str.gsub(/ /, '_') + '\|' + tag
+      end
+    end
+    # add wildcard word match on tag-only search criteria
+    def handle_dangling_tags(re)
+      re.split(/ /).map do |s|
+        if s =~ /\|[^\]]/
+          s + @suffix
+        else
+          s.gsub(/(\(?)([^\)]+)(\S*)/, '\1[^\s\|]+\|\2' + @suffix + '\3')
+        end
+      end.join('\s+')
+    end
+    # Handles the + * ? and {} qualifiers
+    def handle_multiplicators(re)
+      re.gsub(/\(([^\)]+)(\)((\{[^\}]+\})|\*|\+|\?)\s?)/, '(?:\1\s\2')
+    end
+    def ensure_wordbounds(re)
+      re # ending wordbounds is being taken of earlier
+    end
+  end
+end

data/lib/lumix/gui.rb ADDED Viewed

@@ -0,0 +1,148 @@
+require 'lumix/base'
+require 'sweet'
+require 'lumix/result_view'
+#Sweet.set_debug
+Texts = {:search => "Searching...", :read => "Importing files", :link => "Linking texts"}
+Indicator = %w'} ) ] | [ ( {'
+CONF = 'config.yaml'
+ConfigStruct = Struct.new(:database_uri)
+CConfig = YAML.load_file(CONF) rescue ConfigStruct.new('jdbc:postgresql://localhost:5432/concordancer?user=concordancer&password=concordancer')
+def save!
+  File.open(CONF, 'w') do |f|
+    f.write(CConfig.to_yaml)
+  end
+end
+Sweet.app :title => 'Ruby Concordancer', :width => 800, :height => 700, :layout => :grid.conf(:numColumns => 3) do
+  def conc
+    @conc ||= Concordancer.new(CConfig.database_uri, :progress_proc => @progress_proc)#, :recreate => true)
+  end
+  @progress_proc = proc do |p|
+    task = Texts[p.task] || p.task
+    perform do
+      if p.done == p.work
+        @p_status.text = 'Done!'
+        @p_indicator.text = ''
+        @p_bar.fraction = 0
+      else
+        @p_status.text = task
+        @p_indicator.text = Indicator[p.done % Indicator.size]
+        @p_bar.fraction = p.done.to_f / p.work
+      end
+    end
+  end
+  save! unless File.exists?(CONF)
+  menubar do
+    submenu '&File' do
+      submenu '&Import...' do
+        item('E&nglish texts') { import_chooser('en') }
+        item('&Romanian texts') { import_chooser('ro') }
+      end
+      item('&Export findings...') { export_findings }
+      separator
+      item('&Relink texts') { relink }
+      item('&Clear the database') { reconnect :recreate => true }
+      separator
+      item('E&xit') { exit }
+    end
+    #    submenu 'C&orpora' do
+    #      @m_cat = submenu '&Category' do
+    #        item('Cre&ate...') { create_category }
+    #        item('&Import...') { import_chooser }
+    #        separator
+    #        item('&Edit...') { edit_category }
+    #        item('&Delete') { delete_category }
+    #      end
+    #      @m_text = submenu '&Text' do
+    #        item('&Reimport...') { reimport_chooser }
+    #        item('&Delete') { delete_text }
+    #      end
+    #    end
+    #    @m_stats = submenu '&Statistics' do
+    #      item('&Editor') { script_editor }
+    #      separator
+    #      item('&Load Script...') { load_script }
+    #    end
+    #    submenu "&Help" do
+    #      separator
+    #      item('&About') { about }
+    #    end
+  end
+  tree :grid_data => {:align => [:fill, :fill], :span => [1, 2], :grab => [true, true]}
+  @filter = edit_line 'NSN NSN', :grid_data => {:align => [:fill, :center], :grab => true}, :max_size => 40 do
+    perform_search
+  end
+  button 'Search' do
+    perform_search
+  end
+  @results = table :columns => %w[Text Left Hit Right], :sort => true, :grid_data => {:align => [:fill, :fill], :span => 2, :grab => [true, true]}, :scroll => true
+  @counter = label :grid_data => {:span => 2, :align => :fill}
+  @p_status = label(:grid_data => {:align => [:fill, :bottom], :grab => true})
+  @p_bar = progress(:width => 50, :grid_data => {:align => [:right, :bottom]})
+  @p_indicator = label('  ',  :grid_data => {:align => [:right, :bottom]})
+  def perform_search
+    filter = @filter.text
+    @results.data.clear
+    Thread.new do
+      unless filter.empty?
+        puts "finding #{filter}"
+        found = conc.find(filter) do |text, tagged|
+          @results.add_hit(text.name, text.left, text.to_s, text.right)
+        end
+      end
+      perform do
+        @counter.text = "#{found} matches"
+        @p_status.text = "Found #{found || 'no'} matches for #{filter}"
+      end
+    end
+  end
+  def import_chooser(lang)
+    conc.tp.lang = lang
+    Thread.new(conc) do |conc|
+      conc.read('raw')
+    end
+  end
+  def export_findings
+    filename = to_filename(@filter.text) + '.findings'
+    @p_status.text = "Exporting to #{filename}"
+    File.open(filename, 'w') do |f|
+      @results.items.each do |item|
+        unless item.getChecked
+          left, hit, right = (0..2).map{ |i| item.text(i) }
+          f.puts "#{left}\t#{hit}\t#{right}"
+        end
+      end
+    end
+    @p_status.text = "Done! Exported to file #{filename}"
+  end
+  def relink
+    Thread.new(conc) do |conc|
+      conc.link!
+    end
+  end
+  def to_filename(filter)
+    filter.gsub(/\s+/, "_").gsub(/[\*\.\?\"]/, '')
+  end
+  def reconnect(opts = {})
+    @conc = Concordancer.new(CConfig.database_uri, opts.mergs(:progress_proc => @progress_proc))
+  end
+end

data/lib/lumix/lookup.rb ADDED Viewed

@@ -0,0 +1,105 @@
+module Lumix
+  class Lookup
+    class Document
+      def initialize(lookup)
+        @tokens_ds = lookup.tokens
+        @words = lookup.words
+        @tags = lookup.tags
+        @tokens = []
+      end
+      def add_token(text_id, position, word, tag, s_begin, s_end, t_begin, t_end)
+        @tokens << {:text_id => text_id, :position => position, :word_id => @words[word], :tag_id => @tags[tag],
+          :src_begin => s_begin, :src_end => s_end, :tagged_begin => t_begin, :tagged_end => t_end}
+      end
+      def flush
+        tokens, @tokens = @tokens, [] # make sure no double-flush occurs
+        @tokens_ds.multi_insert tokens
+      end
+    end
+    class LookupCollection < Hash
+      def initialize(ds, column)
+        @ds = ds
+        @column = column
+        super(){ |h,k| h[k] = create(k) }
+        @ds.each do |e|
+          self[e[@column]] = e[:id]
+        end
+      end
+      def create(value)
+        @ds.db.transaction(:isolation => :serializable) do
+          @ds.where(@column => value).select(:id).single_value || @ds.insert(@column => value)
+        end
+      end
+    end
+    attr_reader :tokens, :db
+    def initialize(db)
+      puts "Lookup"
+      @db = db
+      @tokens = db[:tokens]
+    end
+    def tags
+      # TODO create only in the context of linking
+      @tags ||= LookupCollection.new(db[:tags], :tag)
+    end
+    def words
+      @words ||= LookupCollection.new(db[:words], :word)
+    end
+    def process(text_id)
+      return true unless tokens.where(:text_id => text_id).empty?
+      doc = Document.new(self)
+      result = yield(doc) if block_given?
+      doc.flush if result
+      result
+    end
+    def find_word(re)
+      find_ids(db[:words], :word => re)
+    end
+    def find_tag(re)
+      find_ids(db[:tags], :tag => re)
+    end
+    # kindly crafted by jeremyevans
+    def find(filters)
+      ds = db[:tokens.as(:t0)]
+      f = filters[0]
+      ds = ds.where(:t0__word_id=>f.word) if f.word
+      ds = ds.where(:t0__tag_id=>f.tag) if f.tag
+      i = 0
+      filters[1..-1].each do |f|
+        as = "t#{i+=1}"
+        h = {}
+        h[:"#{as}__word_id"] = f.word if f.word
+        h[:"#{as}__tag_id"] = f.tag if f.tag
+        ds = ds.join(:tokens.as(as)){ |j, lj, js| {:text_id.qualify(j) => :text_id.qualify(lj), :position.qualify(j) => :position.qualify(lj) + 1} }.where(h)
+      end
+      select = ds.select(:t0__text_id.as(:text_id), :t0__src_begin.as(:src_begin), :"t#{i}__src_end".as(:src_end),
+        :t0__tagged_begin.as(:tagged_begin), :"t#{i}__tagged_end".as(:tagged_end))
+      puts select.sql
+      puts select.explain
+      select.each do |e|
+        yield [e[:text_id], e[:src_begin], e[:src_end], e[:tagged_begin], e[:tagged_end]]
+      end
+    end
+    private
+    def find_ids(tbl, opts)
+      tbl.where(opts).select(:id).map{|e| e[:id]}
+    end
+  end
+end

data/lib/lumix/lookup_filter.rb ADDED Viewed

@@ -0,0 +1,43 @@
+module Lumix
+  class LookupFilter
+    attr_reader :results, :filter
+    Filter = Struct.new(:word, :tag)
+    def initialize(lookup, filter, &result_proc)
+      @filter = filter
+      @result_proc = result_proc
+      @filters = create_filters(lookup, filter)
+      @results = 0
+    end
+    def <<(result)
+      @results += 1
+      @result_proc[*result] if @result_proc
+    end
+    def apply(lookup, &block)
+      lookup.find(@filters) do |range|
+        block[*range] if block and range
+      end
+    end
+    def create_filters(lookup, filter)
+      filter.scan(/(?:(?:\"([^\"]+)\")|(\S+))+/).map do |word, tag|
+        word_re = to_re(word)
+        tag_re = to_re(tag)
+        word_ids = lookup.find_word(word_re) if word_re
+        tag_ids = lookup.find_tag(tag_re) if tag_re
+        Filter.new(word_ids, tag_ids)
+      end
+    end
+    def to_re(txt)
+      return nil if txt.nil? || txt.empty?
+      Regexp.new('^' + txt.gsub(/\s/, '_').gsub(/\*/, '\S*').gsub(/\?/, '\S') + '$')
+    end
+  end
+end

data/lib/lumix/lookup_search.rb ADDED Viewed

@@ -0,0 +1,95 @@
+require 'lumix/lookup_filter'
+require 'lumix/text_snippet'
+require 'lumix/lookup'
+module Lumix
+  class LookupSearch
+    TAGGED = /([^\s\|]+)\|(\S+)/m        # Xxx|YYY
+    def initialize(db, progress)
+      @lookup = Lookup.new(db)
+      @progress = progress
+    end
+    def concurrent_link?
+      true
+    end
+    def simulate!
+      @simulate = true
+    end
+    def link_text(id)
+      ds = TaggedText[id]
+      @lookup.process id do |doc|
+        result = true
+        file, text, tagged = ds.filename, ds.text, ds.tagged
+        puts "Linking text #{file}"
+        txt_pos = 0
+        position = 0
+        tagged.scan(TAGGED) do |word, tag|
+          tagged_begin = $~.begin(0)
+          tagged_end = $~.end(0)
+          # expand "x_y_z" notation to "x y  z"
+          word_re = Regexp.new(Regexp.escape(word).gsub(/\_/, '\s+'))
+          src_match = text[txt_pos..-1].match(word_re) # find the word
+          if src_match
+            offset = src_match.begin(0)
+            src_begin = txt_pos + offset
+            src_end = txt_pos + src_match.end(0)
+            txt_pos = src_end
+            unless @simulate
+              doc.add_token(id, position, word, tag, src_begin, src_end, tagged_begin, tagged_end)
+            end
+          else
+            STDERR.puts "Could not find match for '#{word}' in text #{file}"
+            STDERR.puts text[(txt_pos-10)..(txt_pos+word.size+10)]
+            `echo '#{file}:#{txt_pos}:#{tagged_begin} unmatched "#{word}"' >> unlinked.lst`
+            result = nil
+            break
+          end
+          position += 1
+        end
+        result
+      end
+    rescue => e # TODO remove this crap
+      STDERR.puts e
+      STDERR.puts e.backtrace
+      raise e
+    end
+    def create_filter(f, &block)
+      Lumix::LookupFilter.new(@lookup, f, &block)
+    end
+    def find(*filters, &block)
+      p = Pool.new(4)
+      filters.flatten.each do |f|
+        p.schedule do
+          last_id = -1
+          t = nil
+          f.apply(@lookup) do |text_id, s_begin, s_end, t_begin, t_end|
+            t = TaggedText[text_id] if text_id != last_id
+            last_id = text_id
+            fname = File.basename(t.filename)
+            text_snippet = Lumix::TextSnippet.new(fname, t.text, s_begin, s_end)
+            tagged_snippet = Lumix::TextSnippet.new(fname, t.tagged, t_begin, t_end)
+            f << [text_snippet, tagged_snippet]
+          end
+        end
+      end
+      p.shutdown
+    end
+  end
+  SearchStrategy = LookupSearch
+end

data/lib/lumix/main.rb ADDED Viewed

@@ -0,0 +1,7 @@
+#!/usr/bin/env jruby
+$: << File.join(File.dirname(__FILE__), '..')
+$: << File.join(File.dirname(__FILE__), '../../../Sweet/lib')
+require 'rubygems'
+require 'lumix/gui'

data/lib/lumix/model/base_models.rb ADDED Viewed

@@ -0,0 +1,35 @@
+class TaggedText
+  module InstanceMethods
+    include Enumerable
+    def create(attrs)
+      new(attrs).save_new
+    end
+    private
+    def accessor(*names)
+      names.each do |name|
+        define_method name do
+          @attrs[name]
+        end
+        define_method "#{name}=" do |v|
+          @attrs[name] = v
+        end
+      end
+    end
+  end
+  extend InstanceMethods
+  def initialize(attrs)
+    @id = attrs.delete(:id)
+    @attrs = attrs
+  end
+  attr_reader :id
+  accessor :text, :tagged, :fulltagged, :filename, :tagged_filename, :digest
+  def update(attrs)
+    @attrs.merge(attrs)
+    save
+  end
+end

data/lib/lumix/model/maglev_models.rb ADDED Viewed

@@ -0,0 +1,42 @@
+require 'lumix/model/base_models'
+# Maglev backend for TaggedText. Most of the class-level API is only
+# stubbed out here and returns nil.
+class TaggedText
+  # Commits the current Maglev transaction.
+  def save
+    Maglev.commit_transaction
+  end
+  # Appends this record to the table.
+  # NOTE(review): no `table` method is defined in this file, neither for
+  # instances nor for the class - confirm where it is expected to come from.
+  def save_new
+    self.table << self
+  end
+  class << self
+    # Iterates over all records of the table.
+    def each(&block)
+      table.each &block
+    end
+    # Lookup by Hash, Integer or String key; every branch is still an empty
+    # placeholder, so the method always returns nil.
+    def [](key)
+      case key
+      when Hash
+        # find by values
+      when Integer
+        # find by id
+      when String
+        # find by filename
+      end
+    end
+    # Not implemented yet; returns nil.
+    def exists?(attrs)
+    end
+    # Not implemented yet; returns nil.
+    def ids
+    end
+    # Not implemented yet; returns nil.
+    def count
+    end
+  end
+end

data/lib/lumix/model/mock_models.rb ADDED Viewed

@@ -0,0 +1,46 @@
+require 'lumix/model/base_models'
+class TaggedText
+  def save
+    # data aware ;)
+  end
+  def save_new
+    self.class.table << self
+  end
+  class << self
+    def table
+      @@table ||= []
+    end
+    def each(&block)
+      table.each &block
+    end
+    def [](key)
+      case key
+      when Hash
+        # find by values
+      when Integer
+        table[key]
+      when String
+        # find by filename
+      end
+    end
+    def exists?(attrs)
+    end
+    def ids
+    end
+    def count
+    end
+  end
+end

data/lib/lumix/model/sequel_models.rb ADDED Viewed

@@ -0,0 +1,53 @@
+require 'lumix/model/base_models'
+class TaggedText
+  def save
+    self.class.table.where(:id => @id).update(@attrs)
+  end
+  def save_new
+      @id = self.class.table.insert(@attrs)
+  end
+  class << self
+    attr_accessor :db
+    def each(&block)
+      p = Pool.new(4)
+      table.select(:id).each do |id|
+        p.schedule{block.call self[id[:id]]}
+      end
+      p.shutdown
+    end
+    def table
+      db[:texts]
+    end
+    def [](key)
+      data = case key
+      when Hash
+        table[key]
+      when Integer
+        table[:id => key]
+      when String
+        table[:filename => key]
+      end
+      new data if data
+    end
+    def exists?(attrs)
+      table.where(attrs).count != 0
+    end
+    def ids
+      table.select(:id).map{|v| v[:id]}
+    end
+    def count
+      table.count
+    end
+  end
+end