RubyGems - lumix - Versions diffs - 0.0.2-java - Mend

lumix 0.0.2-java

Files changed (35) hide show

data/COPYING +18 -0
data/bin/lumix +4 -0
data/bin/lumix-gui +4 -0
data/lib/lumix/base.rb +56 -0
data/lib/lumix/charset.rb +35 -0
data/lib/lumix/cli.rb +96 -0
data/lib/lumix/concordancer.rb +254 -0
data/lib/lumix/corrections.rb +84 -0
data/lib/lumix/fast_search.rb +91 -0
data/lib/lumix/filter.rb +89 -0
data/lib/lumix/gui.rb +148 -0
data/lib/lumix/lookup.rb +105 -0
data/lib/lumix/lookup_filter.rb +43 -0
data/lib/lumix/lookup_search.rb +95 -0
data/lib/lumix/main.rb +7 -0
data/lib/lumix/model/base_models.rb +35 -0
data/lib/lumix/model/maglev_models.rb +42 -0
data/lib/lumix/model/mock_models.rb +46 -0
data/lib/lumix/model/sequel_models.rb +53 -0
data/lib/lumix/proto/lookup.rb +105 -0
data/lib/lumix/proto/lookup_filter.rb +40 -0
data/lib/lumix/proto/lookup_search.rb +81 -0
data/lib/lumix/result_view.rb +93 -0
data/lib/lumix/schema/001_create_tables.rb +35 -0
data/lib/lumix/schema/002_categories.rb +28 -0
data/lib/lumix/schema/003_add_fulltagged.rb +15 -0
data/lib/lumix/schema/004_create_lookup_tables.rb +44 -0
data/lib/lumix/slow_search.rb +104 -0
data/lib/lumix/text_snippet.rb +29 -0
data/lib/lumix/textprocessing.rb +108 -0
data/lib/lumix/thread_pool.rb +127 -0
data/spec/filter_spec.rb +55 -0
data/spec/lookup_spec.rb +70 -0
data/spec/text_snippet_spec.rb +55 -0
metadata +175 -0

data/COPYING ADDED Viewed

@@ -0,0 +1,18 @@
+Copyright (c) 2010 Michael Klaus
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to
+deal in the Software without restriction, including without limitation the
+rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+sell copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/bin/lumix ADDED Viewed

@@ -0,0 +1,4 @@
+#!/usr/bin/env jruby -Eutf-8:utf-8 -Ku -U -J-Xmx1024m
+require 'rubygems'
+require 'lumix/cli'

data/bin/lumix-gui ADDED Viewed

@@ -0,0 +1,4 @@
+#!/usr/bin/env jruby
+require 'rubygems'
+require 'lumix/gui'

data/lib/lumix/base.rb ADDED Viewed

@@ -0,0 +1,56 @@
+require 'yaml'
# Shared entry points used by the lumix front ends: configuration loading
# plus thin wrappers around a lazily created Concordancer.
#
# The including context must provide a +progress_proc+ method; this module
# calls it in #create_concordancer but does not define it (cli.rb does).
module Lumix
  # Captions shown for each progress task symbol.
  Texts = {:search => "Searching...", :read => "Importing files", :link => "Linking texts"}

  # Configuration file name, resolved against the current working directory.
  CONF = 'config.yaml'
  ConfigStruct = Struct.new(:database_uri)

  # Loaded when this file is required. If CONF does not exist yet, a default
  # configuration is written to it and used.
  CConfig = if File.exist?(CONF) # File.exists? no longer exists on Ruby >= 3.2
    # The file holds a YAML-serialized ConfigStruct. Psych 4 refuses to build
    # Struct instances through the default (safe) load_file, so use the
    # explicit unsafe loader where it is available.
    # NOTE(review): this instantiates whatever objects config.yaml names;
    # the file must be trusted.
    if YAML.respond_to?(:unsafe_load_file)
      YAML.unsafe_load_file(CONF)
    else
      YAML.load_file(CONF)
    end
  else
    conf = ConfigStruct.new('jdbc:postgresql://localhost:5433/concordancer?user=concordancer&password=concordancer')
    File.open(CONF, 'w') { |f| f.write(conf.to_yaml) }
    conf
  end

  # Returns the memoized Concordancer, creating it on first use.
  def conc
    @conc ||= create_concordancer
  end

  # Sets the text-processing language to +lang+ and imports the given paths.
  def import_files(lang, *path)
    conc.tp.lang = lang
    conc.read(path)
  end

  # Delegates to Concordancer#link!.
  def relink
    conc.link!
  end

  # Switches the search strategy into simulation mode, then relinks.
  def simulate_link
    conc.simulate!
    conc.link!
  end

  # Delegates to Concordancer#link.
  def link
    conc.link
  end

  # Replaces the memoized Concordancer with a new one built from +opts+.
  def reconnect(opts = {})
    @conc = create_concordancer(opts)
  end

  # Delegates to Concordancer#correct for the given text ids.
  def correct(*ids)
    conc.correct(*ids)
  end

  # Derives a file name from a filter expression: whitespace runs become
  # underscores, dots and double quotes are removed.
  def to_filename(filter)
    filter.gsub(/\s+/, "_").gsub(/[\.\"]/, '')
  end

  # Builds a Concordancer for the configured database URI, passing +opts+
  # plus the host-provided progress callback.
  def create_concordancer(opts = {})
    Concordancer.new(CConfig.database_uri, opts.merge(:progress_proc => progress_proc))
  end
end
+require 'lumix/concordancer'

data/lib/lumix/charset.rb ADDED Viewed

@@ -0,0 +1,35 @@
+require 'ffi-icu'
+require 'iconv'
+require 'htmlentities'
# Charset helpers added to String: convert text of unknown encoding to
# UTF-8 and decode HTML entities.
class String
  # Raised by #to_utf when no candidate charset converts cleanly.
  # Derives from StandardError so that a plain +rescue+ in callers sees it;
  # +rescue Exception+ and +rescue String::NoMatchFound+ keep working.
  NoMatchFound = Class.new(StandardError)

  # Converts the receiver to UTF-8 and decodes HTML entities.
  #
  # The +default+ charset is tried first; if that conversion fails, the
  # charsets proposed by the ICU detector are tried in order.
  #
  # No state is cached on the receiver, so this also works on frozen
  # strings.
  #
  # Raises NoMatchFound when no conversion succeeds.
  def to_utf(default = 'utf-8')
    result = icu_return(default) || find_icu
    raise NoMatchFound unless result
    HTMLEntities.new.decode(result)
  end

  # Returns the receiver converted from the first detected charset that
  # converts without error, or nil when none does.
  def find_icu
    ICU::CharDet::Detector.new.detect_all(self).each do |match|
      converted = icu_return(match.name)
      return converted if converted
    end
    nil
  end

  # Converts the receiver from charset +cs+ to UTF-8.
  # Returns nil when the conversion raises (best-effort probe).
  def icu_return(cs)
    Iconv.conv('UTF-8', cs, self)
  rescue StandardError
    nil
  end
end

data/lib/lumix/cli.rb ADDED Viewed

@@ -0,0 +1,96 @@
+require 'lumix/base'
+include Lumix
# Prints the three usage lines of the command-line tool, then terminates
# the process via +exit+.
def help
  puts "lumix-cli import <en|ro> <path>",
       "lumix-cli [search] 'search string' ...",
       "lumix-cli relink"
  exit
end
# Runs every filter expression in +filters+ and writes each hit to a
# findings file named after the expression.
#
# Expressions for which create_findings_file returns nothing (the file
# already exists) are skipped. A summary line per executed filter is
# printed afterwards. All opened files are closed even if the search raises.
def search(*filters)
  files = []
  active = []
  filters.each do |expression|
    out = create_findings_file(expression)
    next unless out
    files << out
    filter = conc.create_filter(expression) do |text, tagged|
      out.puts "#{text.name}: #{text.left} | #{tagged.to_s} | #{text.right}"
    end
    active << filter unless filter.nil?
  end
  conc.find(active) unless active.empty?
  active.each do |f|
    count = f.results == 0 ? 'no' : f.results
    puts "Found #{count} matches for #{f.filter}"
  end
ensure
  files.each { |f| f.close }
end
# Opens the findings file for +filter+ for writing.
#
# filter   - the filter expression; only used to derive the default name.
# filename - target path; defaults to to_filename(filter).
# block    - optional; forwarded to File.open, which then closes the file
#            and returns the block's value.
#
# Returns the open File (or the block's value). If +filename+ already
# exists, nothing is overwritten: a notice is printed and nil is returned.
def create_findings_file(filter, filename = to_filename(filter), &block)
  # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
  if File.exist?(filename)
    puts "File #{filename} already exists! Ignoring."
  else
    File.open(filename, 'w', &block)
  end
end
# An earlier single-file variant, tag(lang, file), which printed the tagged
# text to stdout, used to be defined here. Ruby keeps only the last
# definition of a method name, so the multi-file +tag+ below always replaced
# it before any command was dispatched; the dead definition is dropped.

# Enables link-on-import on the concordancer, then imports +files+ using
# language +lang+.
def import!(lang, *files)
  conc.link_on_import!
  import_files(lang, *files)
end

# Tags every file in +files+ (expanded through the text processor's
# to_filelist) on a pool of 10 workers.
#
# A file is skipped when its tagged counterpart already exists on disk.
# Returns the result of shutting the pool down.
def tag(lang, *files)
  pool = Pool.new(10) # not named +p+, which would shadow Kernel#p
  conc.tp.lang = lang
  conc.tp.to_filelist(files).each do |file|
    pool.schedule do
      tagged = conc.tp.create_tagged_filename(file)
      # File.exists? was removed in Ruby 3.2.
      conc.tp.process_file(file, tagged) unless File.exist?(tagged)
    end
  end
  pool.shutdown
end
private

# Builds the console progress reporter handed to the concordancer.
#
# The returned proc takes a progress object responding to +task+, +done+
# and +work+. When the task changes, its caption (from Texts, falling back
# to the task itself) is printed. While work is outstanding, one dot is
# printed per 2% completed; "Done" is printed once +done+ equals +work+.
#
# The number of dots already printed is tracked directly. Dividing each
# individual increment by two (as before) rounded small steps down to zero
# dots, so imports with many files never showed a bar.
def progress_proc
  task = nil
  dots = 0
  proc do |state|
    if !task || state.task != task
      task = state.task
      dots = 0
      puts Texts[task] || task
    end
    if state.done == state.work
      puts "Done"
    else
      due = (100 * state.done / state.work).to_i / 2
      if due > dots
        print "." * (due - dots)
        dots = due
      end
    end
  end
end
# Command dispatch: the first argument names the command, the rest are its
# arguments.
command, *args = ARGV
unless command
  #help
  # With no arguments at all, run a built-in sample search.
  command, *args = 'search', 'N "de" N'
end
action = command.downcase.to_sym
if command =~ /\A-{1,2}help\z/i
  # Previously this branch assigned +cmd+ while +c+ was dispatched, so
  # "-help"/"--help" ended in NoMethodError instead of the usage text.
  action = :help
elsif !respond_to?(action, true)
  # Top-level methods are private, so the lookup must include private
  # methods. Anything that is not a command is treated as the first search
  # expression ("lumix-cli [search] 'search string' ..."), so it is put back
  # in front of the remaining arguments.
  action = :search
  args.unshift(command)
end
# NOTE(review): +send+ ignores visibility, so any private method name
# reachable from main (including Kernel's) can be invoked from the command
# line; consider an explicit list of allowed commands.
send action, *args

data/lib/lumix/concordancer.rb ADDED Viewed

@@ -0,0 +1,254 @@
+#!/bin/env ruby
+# TODO take care of 's problem
+# TODO remove Word count line
+require 'rubygems'
+require 'digest/md5'
+require 'sequel'
+require 'sequel/extensions/migration'
+require 'lumix/model/sequel_models'
+require 'lumix/thread_pool'
+require 'lumix/textprocessing'
+require 'lumix/lookup_search'
+#require 'lumix/fast_search'
module Lumix
  # Number of worker threads used for importing files (LUMIX_WORKERS,
  # default 20).
  WORKERS = (ENV['LUMIX_WORKERS'] || 20).to_i
  # When LUMIX_RELINK is set, Concordancer#read schedules a full relink.
  RELINK = ENV['LUMIX_RELINK']
  # Schema version passed to the Sequel migrator.
  DB_VERSION = 4

  class ::String
    # Returns the MD5 hex digest of the string's current content.
    #
    # Computed on every call: caching it in an instance variable returned a
    # stale digest after the string was mutated and raised on frozen strings.
    def digest
      Digest::MD5.hexdigest(self)
    end
  end

  # Progress report handed to the progress callback.
  Progress = Struct.new(:task, :work, :data, :done)

  # Imports text files into the database, keeps their tagged counterparts
  # and delegates linking and searching to the configured SearchStrategy.
  class Concordancer
    attr_reader :db, :tp
    attr_accessor :progress_proc
    attr_writer :link_on_import

    # db_uri  - Sequel connection URI.
    # options - :progress_proc: callable receiving Progress objects;
    #           :recreate: when truthy, drops every table and migrates again.
    def initialize(db_uri, options = {})
      @progress_proc = options[:progress_proc]
      @db = connect(db_uri)
      if options[:recreate]
        db.tables.each { |t| db.drop_table t }
        migrate(db)
      end
      @ids = all
      @tp = TextProcessing.new
    end

    # Returns the memoized search strategy bound to this database.
    def strategy
      @strategy ||= SearchStrategy.new(@db, @progress_proc)
    end

    # Pool used for link jobs: 4 workers if the strategy allows concurrent
    # linking, otherwise 1.
    def create_link_pool
      Pool.new(strategy.concurrent_link? ? 4 : 1)
    end

    def link_on_import?
      @link_on_import
    end

    def link_on_import!
      @link_on_import = true
    end

    # Returns the id of the stored text whose digest equals that of +file+'s
    # UTF-8 content, or nil when there is none.
    def get_id(file)
      text = File.read(file).to_utf
      saved = TaggedText[:digest => text.digest]
      saved ? saved.id : nil
    end

    # Imports the given files (expanded via the text processor's
    # to_filelist) on a pool of WORKERS threads.
    #
    # Files listed in 'unprocessed.lst' are ignored; files that raise while
    # being read are reported, appended to that list and skipped. When
    # link-on-import is enabled, every newly stored text is scheduled for
    # linking, followed by a final link over all texts.
    def read(*files)
      files = tp.to_filelist(*files)
      prog = Progress.new(:read, files.size)
      puts "Reading #{files.size} files"

      # File.exists? was removed in Ruby 3.2.
      @unprocessed = if File.exist?('unprocessed.lst')
        File.readlines('unprocessed.lst').map(&:chomp)
      else
        []
      end

      File.open('unprocessed.lst', 'a') do |up|
        link_pool = create_link_pool
        read_pool = Pool.new(WORKERS)
        link_pool.schedule { link! } if RELINK

        files.each_with_index do |file, index|
          if @unprocessed.member?(file)
            puts "Ignoring #{file}"
            next
          end
          read_pool.schedule do
            begin
              record = read_file(file)
              # read_file returns the stored record (or nil), while #link
              # expects ids, so pass the record's id.
              link_pool.schedule { link record.id } if record && link_on_import?
            rescue => e
              puts "Error on file #{file}: #{e}", e.backtrace
              @unprocessed << file
              up.puts file
            end
            progress(prog, index + 1)
          end
        end

        # NOTE(review): this final link is queued before the read pool has
        # been shut down — confirm Pool ordering guarantees it really runs
        # after every per-file link job.
        link_pool.schedule { link } if link_on_import? # make sure everything is linked
        read_pool.shutdown
        link_pool.shutdown
      end
    end

    # Stores +file+ unless a text with the same filename and digest exists.
    #
    # The tagged version is read from the tagged file if present; otherwise
    # it is produced by the text processor and written there. Yields the new
    # record when a block is given.
    #
    # Returns the created record, or nil when the file was already stored.
    def read_file(file)
      text = File.read(file).to_utf
      return nil if TaggedText.exists?(:filename => file, :digest => text.digest)

      puts "Reading file #{file}"
      # retrieve the tagged version
      tagged_file = tp.create_tagged_filename(file)
      tagged = if File.exist?(tagged_file)
        File.read(tagged_file)
      else
        fresh = tp.process(text)
        File.open(tagged_file, 'w') { |out| out.write fresh }
        fresh
      end
      retagged = retag(tagged)
      tt = TaggedText.create(:digest => text.digest, :text => text, :tagged => retagged,
                             :filename => file, :tagged_filename => tagged_file)
      @ids << tt.id
      yield tt if block_given?
      tt
    end

    # Re-reads the source file of each given text id (all texts when none
    # are given), storing the current text and, if it changed, its digest.
    # Ids without a stored text are skipped.
    def correct(*ids)
      ids = all if ids.empty?
      ids.flatten.each do |id|
        record = TaggedText[id.to_i]
        next unless record
        file = record.filename
        text = File.read(file).to_utf
        record.text = text
        expected = text.digest
        if record.digest != expected
          puts "Correcting text #{file}"
          record.digest = expected
        end
        record.save
      end
    end

    # Ids of all stored texts.
    def all
      TaggedText.ids
    end

    def simulate!
      strategy.simulate!
    end

    # Calls #link with a block that deletes the dataset it is given.
    # #link currently never yields (that code is commented out pending the
    # "implement force" TODO), so today this behaves exactly like #link.
    def link!(*ids)
      link(*ids) do |ds|
        ds.delete
      end
    end

    # Links the texts with the given ids (all texts when none are given)
    # through the search strategy on the link pool, reporting progress.
    def link(*ids)
      ids = all if ids.empty?
      ids.flatten!
      prog = Progress.new(:link, ids.size)
      progress(prog)
      pool = create_link_pool
      ids.each_with_index do |id, index|
        #ds = db[:assoc].filter(:text_id => id)
        #yield ds if block_given?
        # TODO implement force
        pool.schedule do
          strategy.link_text(id) #if ds.empty?
          progress(prog, index + 1)
        end
      end
      pool.shutdown
    end

    def create_filter(f, &block)
      strategy.create_filter(f, &block)
    end

    def find(filters)
      strategy.find(filters)
    end

    private

    # Connects to +db_uri+, falling back to a local sqlite database when a
    # trivial query fails; then migrates and binds TaggedText to the
    # connection. Returns the database.
    def connect(db_uri)
      db = Sequel.connect(db_uri)
      begin
        db.get(1)
      rescue Exception => e
        # NOTE(review): rescuing Exception also traps interrupts and exit
        # requests; narrow this once the errors raised by the JDBC adapters
        # in use are confirmed.
        puts 'Falling back to sqlite'
        puts e
        db = Sequel.connect('jdbc:sqlite://concordancer.db')
      end
      migrate(db)
      TaggedText.db = db
    end

    # Applies the migrations in the sibling 'schema' directory up to
    # DB_VERSION.
    def migrate(db)
      migration_path = File.join(File.dirname(__FILE__), 'schema')
      Sequel::Migrator.apply(db, migration_path, DB_VERSION)
    end

    # Updates +prog+ and passes it to the progress callback, if one is set.
    def progress(prog, done = 0, data = prog.data)
      if progress_proc
        prog.done = done
        prog.data = data
        progress_proc.call(prog)
      end
    end

    # Reduces four-field tokens ("a|b|c|d", separated by spaces or newlines)
    # to "word|tag" tokens joined by single spaces.
    #
    # Text whose first token does not have exactly four fields is taken to
    # be retagged already and is returned unchanged (as is empty text).
    # If the last two fields of the first token are integers the text is
    # treated as fulltagged (word|tag|begin|end) and field 1 is the tag;
    # otherwise field 2 is used.
    # NOTE(review): the layout of the non-fulltagged format is inferred from
    # the field index only — confirm against the tagger's output.
    #
    # Fixes over the previous version: non-empty chunks were skipped instead
    # of empty ones, the tag was taken as a character of the second field
    # rather than as a field, and the integer checks were unanchored (so a
    # tag such as "V3" counted as a number).
    def retag(text)
      chunks = text.split(/[ \n]/).reject { |chunk| chunk.empty? }
      return text if chunks.empty?

      token = chunks.first.split(/\|/)
      return text if token.size != 4 # looks pre-retagged

      fulltagged = token[2] =~ /\A\d+\z/ && token[3] =~ /\A\d+\z/
      tag_position = fulltagged ? 1 : 2

      retagged = chunks.map do |chunk|
        fields = chunk.split(/\|/)
        "#{fields[0]}|#{fields[tag_position]}"
      end
      retagged.join(' ')
    end
  end
end

data/lib/lumix/corrections.rb ADDED Viewed

@@ -0,0 +1,84 @@
+require 'lumix/charset'
+CORRECTIONS = <<-TXT
+catre | S
+fetite | NPRN
+in | S
+si | C
+circa | R
+fata de| S
+maxima | ASON
+inainte| R
+in materie de | R
+tin | V3
+beneficiaza | V3
+: | COLON
+ocupa | VN
+asigurata | VPSF
+mine | PPSA
+batut | VPSM
+insa | C
+impotriva | S
+americana | ASN
+caruia | R
+da | VN
+duce| VN
+primeasca | V3
+daca | C
+bulgara | ASN
+ramina | V3
+albaneza | ASN
+pina | S
+paraseasca | V3
+publica | ASN
+inceapa | V3
+ecologic | ASN
+internationala | ASN
+ecologista | ASN
+cada | V3
+linga | S
+adevaratele | APRY
+citiva | PI
+americana | ASN
+Miclici| NP
+fara | S
+cit | PI
+sugereaza | V3
+incasa | VN
+circa | R
+ghiceste | V3
+tarile |NPRY
+araba | ASN
+citeva | PI
+schimbindu | VG
+dupa | S
+uleiurilor_vegetale | NPOY
+botosaneana | ASN
+oricarui | PI
+TXT
# Returns the memoized list of [pattern, replacement] pairs built from
# CORRECTIONS, one per "word | tag" line.
#
# Each pattern matches the word (at a word boundary) followed by "|" and
# its current tag; the replacement is "word|tag" with the corrected tag.
# The word is regex-escaped so that it is matched literally, and lines
# without both a word and a tag (e.g. blank lines) are skipped.
# A "Tagging ..." line is printed for every rule when the list is built.
def corrections
  @corrections ||= CORRECTIONS.split(/\n/).map { |line|
    word, tag = line.split(/\|/).map(&:strip)
    next if word.nil? || word.empty? || tag.nil? || tag.empty?
    puts "Tagging #{word} as #{tag}"
    [/\b#{Regexp.escape(word)}\|\S+/, "#{word}|#{tag}"]
  }.compact
end
# Applies every rule from +corrections+ to the tagged text +t+, in order,
# and returns the rewritten text. +t+ itself is not modified.
def correct(t)
  fixed = t
  corrections.each do |pattern, replacement|
    fixed = fixed.gsub(pattern, replacement)
  end
  fixed
end
# Rewrites, in place, every file directly under +path+ whose name contains
# "tagged", replacing its content with the corrected text.
def correct_all(path)
  Dir.glob(File.join(path, '*tagged*')).each do |name|
    corrected = correct(File.read(name))
    File.open(name, 'w') do |out|
      out.print corrected
    end
  end
end
# When this file is executed directly (not required), correct all tagged
# files under the directory given as the first command-line argument.
correct_all ARGV[0] if __FILE__ == $0

data/lib/lumix/fast_search.rb ADDED Viewed

@@ -0,0 +1,91 @@
+require 'lumix/filter'
+require 'lumix/text_snippet'
module Lumix
  # Search strategy that links every tagged token to its character range in
  # the source text and searches the resulting "fulltagged" string.
  class FastSearch
    TAGGED = /([^\s\|]+)\|(\S+)/m        # Xxx|YYY
    ORIG = /([^\|\s]*)\|([^\|\s]*)\|([^\|\s]*)\|(\S*)/ # X|Y|Z|W

    # db       - database handle (stored only).
    # progress - callable receiving Progress objects, or nil.
    def initialize(db, progress)
      @db = db
      @progress = progress
    end

    def concurrent_link?
      true
    end

    # Builds and stores the fulltagged form ("word|tag|begin|end ...") of
    # the text with the given id.
    #
    # Tokens are located in the source text in order, each search starting
    # where the previous match ended; "_" in a tagged word matches any
    # amount of whitespace in the source.
    #
    # Returns the fulltagged string (the stored one if the text was linked
    # before). Returns nil when no text has that id, or when a token cannot
    # be found; the latter is reported on STDERR and appended to
    # 'unlinked.lst', and nothing is saved.
    def link_text(id)
      ds = TaggedText[id]
      return nil unless ds
      return ds.fulltagged if ds.fulltagged

      file, text, tagged = ds.filename, ds.text, ds.tagged
      puts "Linking text #{file}"

      txt_pos = 0
      linked = ''
      tagged.scan(TAGGED) do |word, tag|
        tagged_begin = $~.begin(0) # read before the next match overwrites $~
        # expand "x_y_z" notation to "x y z"
        word_re = Regexp.new(Regexp.escape(word).gsub(/_/, '\s*'))
        # Search from txt_pos directly instead of copying the rest of the
        # text for every token; the offsets are then absolute already.
        src_begin = text.index(word_re, txt_pos) # find the word
        unless src_begin
          report_unlinked(file, text, word, txt_pos, tagged_begin)
          return nil
        end
        src_end = $~.end(0)
        txt_pos = src_end

        linked << ' ' unless linked.empty?
        linked << word << '|' << tag << '|' << src_begin.to_s << '|' << src_end.to_s
      end

      unless linked.empty?
        ds.fulltagged = linked
        ds.save
      end
      linked
    rescue => e # TODO remove this crap
      STDERR.puts e
      STDERR.puts e.backtrace
      raise
    end

    # Creates a filter for expression +f+ whose tokens carry the
    # "|begin|end" suffix of the fulltagged format.
    def create_filter(f, &block)
      Lumix::Filter.new('\|(\d+)\|(\d+)', f, &block)
    end

    # Runs every filter over every linked text. Each hit is appended to its
    # filter as a [text_snippet, tagged_snippet] pair.
    #
    # Texts that were never linked (fulltagged is nil) are skipped. Progress
    # is reported after each text as the number of texts handled, so the
    # last report has done == work.
    def find(filters)
      prog = Progress.new(:search, TaggedText.count, "", 0)
      @progress.call(prog) if @progress

      TaggedText.each_with_index do |t, i|
        if t.fulltagged
          fname = File.basename(t.filename)
          # matches to ranges
          filters.each do |f|
            f.scan(t.fulltagged) do |_hit, t_begin, t_end, m|
              s_begin = m.captures.first.to_i
              s_end = m.captures.last.to_i
              tagged_snippet = Lumix::TextSnippet.new(fname, t.fulltagged, t_begin, t_end)
              text_snippet = Lumix::TextSnippet.new(fname, t.text, s_begin, s_end)
              f << [text_snippet, tagged_snippet]
            end
          end
        end
        prog.done = i + 1
        @progress.call(prog) if @progress
      end
    end

    private

    # Reports a token that could not be located: prints the word and up to
    # 10 characters of context on either side to STDERR and appends a line
    # to 'unlinked.lst'.
    def report_unlinked(file, text, word, txt_pos, tagged_begin)
      STDERR.puts "Could not find match for '#{word}' in text #{file}"
      # Clamp at 0: a negative start would index from the end of the text.
      context_start = [txt_pos - 10, 0].max
      STDERR.puts text[context_start..(txt_pos + word.size + 10)]
      # Written directly rather than through a shell `echo`, which broke on
      # (and could execute) quotes in the file name or word.
      File.open('unlinked.lst', 'a') do |log|
        log.puts %(#{file}:#{txt_pos}:#{tagged_begin} unmatched "#{word}")
      end
    end
  end

  SearchStrategy = FastSearch
end