RubyGems - lumix - Versions diffs - 0.0.2-java - Mend

lumix 0.0.2-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35) hide show

data/COPYING +18 -0
data/bin/lumix +4 -0
data/bin/lumix-gui +4 -0
data/lib/lumix/base.rb +56 -0
data/lib/lumix/charset.rb +35 -0
data/lib/lumix/cli.rb +96 -0
data/lib/lumix/concordancer.rb +254 -0
data/lib/lumix/corrections.rb +84 -0
data/lib/lumix/fast_search.rb +91 -0
data/lib/lumix/filter.rb +89 -0
data/lib/lumix/gui.rb +148 -0
data/lib/lumix/lookup.rb +105 -0
data/lib/lumix/lookup_filter.rb +43 -0
data/lib/lumix/lookup_search.rb +95 -0
data/lib/lumix/main.rb +7 -0
data/lib/lumix/model/base_models.rb +35 -0
data/lib/lumix/model/maglev_models.rb +42 -0
data/lib/lumix/model/mock_models.rb +46 -0
data/lib/lumix/model/sequel_models.rb +53 -0
data/lib/lumix/proto/lookup.rb +105 -0
data/lib/lumix/proto/lookup_filter.rb +40 -0
data/lib/lumix/proto/lookup_search.rb +81 -0
data/lib/lumix/result_view.rb +93 -0
data/lib/lumix/schema/001_create_tables.rb +35 -0
data/lib/lumix/schema/002_categories.rb +28 -0
data/lib/lumix/schema/003_add_fulltagged.rb +15 -0
data/lib/lumix/schema/004_create_lookup_tables.rb +44 -0
data/lib/lumix/slow_search.rb +104 -0
data/lib/lumix/text_snippet.rb +29 -0
data/lib/lumix/textprocessing.rb +108 -0
data/lib/lumix/thread_pool.rb +127 -0
data/spec/filter_spec.rb +55 -0
data/spec/lookup_spec.rb +70 -0
data/spec/text_snippet_spec.rb +55 -0
metadata +175 -0

data/COPYING ADDED Viewed

@@ -0,0 +1,18 @@
+Copyright (c) 2010 Michael Klaus
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to
+deal in the Software without restriction, including without limitation the
+rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+sell copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/bin/lumix ADDED Viewed

@@ -0,0 +1,4 @@
+#!/usr/bin/env jruby -Eutf-8:utf-8 -Ku -U -J-Xmx1024m
+require 'rubygems'
+require 'lumix/cli'

data/bin/lumix-gui ADDED Viewed

@@ -0,0 +1,4 @@
+#!/usr/bin/env jruby
+require 'rubygems'
+require 'lumix/gui'

data/lib/lumix/base.rb ADDED Viewed

@@ -0,0 +1,56 @@
+require 'yaml'
+module Lumix
+  Texts = {:search => "Searching...", :read => "Importing files", :link => "Linking texts"}
+  CONF = 'config.yaml'
+  ConfigStruct = Struct.new(:database_uri)
+  CConfig = if File.exists?(CONF)
+    YAML.load_file(CONF)
+  else
+    conf = ConfigStruct.new('jdbc:postgresql://localhost:5433/concordancer?user=concordancer&password=concordancer')
+    File.open(CONF, 'w') do |f|
+      f.write(conf.to_yaml)
+    end
+    conf
+  end
+  def conc
+    @conc ||= create_concordancer
+  end
+  def import_files(lang, *path)
+    conc.tp.lang = lang
+    conc.read(path)
+  end
+  def relink
+    conc.link!
+  end
+  def simulate_link
+    conc.simulate!
+    conc.link!
+  end
+  def link
+    conc.link
+  end
+  def reconnect(opts = {})
+    @conc = create_concordancer(opts)
+  end
+  def correct(*ids)
+    conc.correct *ids
+  end
+  def to_filename(filter)
+    filter.gsub(/\s+/, "_").gsub(/[\.\"]/, '')
+  end
+  def create_concordancer(opts = {})
+    Concordancer.new(CConfig.database_uri, opts.merge(:progress_proc => progress_proc))
+  end
+end
+require 'lumix/concordancer'

data/lib/lumix/charset.rb ADDED Viewed

@@ -0,0 +1,35 @@
+require 'ffi-icu'
+require 'iconv'
+require 'htmlentities'
+class String
+  NoMatchFound = Class.new(Exception)
+  def to_utf(default = 'utf-8')
+    @icu ||= ICU::CharDet::Detector.new
+    result = icu_return(default) || find_icu
+    raise NoMatchFound unless result
+    @entities ||= HTMLEntities.new
+    @entities.decode(result)
+  end
+  def find_icu
+    matches = @icu.detect_all(self)
+    matches.each do |match|
+      if d = icu_return(match.name)
+        return d
+      end
+    end
+    return nil
+  end
+  def icu_return(cs)
+    begin
+      return Iconv.conv('UTF-8', cs, self)
+    rescue
+    end
+  end
+end

data/lib/lumix/cli.rb ADDED Viewed

@@ -0,0 +1,96 @@
+require 'lumix/base'
+include Lumix
+def help
+  puts "lumix-cli import <en|ro> <path>"
+  puts "lumix-cli [search] 'search string' ..."
+  puts "lumix-cli relink"
+  exit
+end
+def search(*filters)
+  files = []
+  fs = filters.map do |filt|
+    file = create_findings_file(filt)
+    next unless file
+    files << file
+    conc.create_filter(filt) do |text, tagged|
+      file.puts "#{text.name}: #{text.left} | #{tagged.to_s} | #{text.right}"
+      #file.puts "#{text.name}: #{tagged.to_s}"
+    end
+  end.compact
+  conc.find(fs) unless fs.empty?
+  fs.each do |f|
+    puts "Found #{f.results == 0 ? 'no' : f.results} matches for #{f.filter}"
+  end
+ensure
+  files.each{ |f| f.close }
+end
+def create_findings_file(filter, filename = to_filename(filter), &block)
+  if File.exists?(filename)
+    puts "File #{filename} already exists! Ignoring."
+  else
+    File.open(filename, 'w', &block)
+  end
+end
+def tag(lang, file)
+  conc.tp.lang = lang
+  puts conc.tp.process(File.read(file))
+end
+def import!(lang, *files)
+  conc.link_on_import!
+  import_files(lang, *files)
+end
+def tag(lang, *files)
+  p = Pool.new(10)
+  conc.tp.lang = lang
+  conc.tp.to_filelist(files).each do |file|
+    p.schedule do
+      tagged = conc.tp.create_tagged_filename(file)
+      conc.tp.process_file(file, tagged) unless File.exists?(tagged)
+    end
+  end
+  p.shutdown
+end
+private
+def progress_proc
+  task = nil
+  percent = 0
+  proc do |p|
+    if !task or p.task != task
+      task = p.task
+      percent = 0
+      puts Texts[task] || task
+    end
+    if p.done == p.work
+      puts "Done"
+    else
+      new_percent = (100 * p.done / p.work).to_i
+      if new_percent > percent
+        print "." * ((new_percent - percent) / 2)
+        percent = new_percent
+      end
+    end
+  end
+end
+cmd, *args = ARGV
+if !cmd
+  #help
+  cmd, *args = 'search', 'N "de" N'
+end
+c = cmd.downcase.to_sym
+cmd = :help if c =~ /^-{1,2}help$/i
+cmd = :search if !respond_to?(c)
+send c, *args

data/lib/lumix/concordancer.rb ADDED Viewed

@@ -0,0 +1,254 @@
+#!/bin/env ruby
+# TODO take care of 's problem
+# TODO remove Word count line
+require 'rubygems'
+require 'digest/md5'
+require 'sequel'
+require 'sequel/extensions/migration'
+require 'lumix/model/sequel_models'
+require 'lumix/thread_pool'
+require 'lumix/textprocessing'
+require 'lumix/lookup_search'
+#require 'lumix/fast_search'
+module Lumix
+  # Number of workers used to import files (LUMIX_WORKERS, default 20).
+  WORKERS = (ENV['LUMIX_WORKERS'] || 20).to_i
+  # When LUMIX_RELINK is set, Concordancer#read schedules a link! run first.
+  RELINK = ENV['LUMIX_RELINK']
+  # Schema version the database is migrated to.
+  DB_VERSION = 4
+  class ::String
+    # Returns the MD5 hex digest of the string's current content.
+    # It is computed on every call: a digest cached in an instance variable
+    # goes stale once the string is mutated and cannot be stored on a frozen
+    # string.
+    def digest
+      Digest::MD5.hexdigest(self)
+    end
+  end
+  # Progress report handed to the progress proc.
+  Progress = Struct.new(:task, :work, :data, :done)
+  # Imports text files into the database as TaggedText records and delegates
+  # linking and searching to the configured SearchStrategy.
+  class Concordancer
+    attr_reader :db, :tp
+    attr_accessor :progress_proc
+    attr_writer :link_on_import
+    # db_uri  - Sequel connection URI (see #connect for the fallback).
+    # options - :progress_proc receives Progress updates;
+    #           :recreate drops all tables and migrates again.
+    def initialize(db_uri, options = {})
+      @progress_proc = options[:progress_proc]
+      @db = connect(db_uri)
+      if options[:recreate]
+        db.tables.each{ |t| db.drop_table t }
+        migrate(db)
+      end
+      @ids = all
+      @tp = TextProcessing.new
+    end
+    # Returns the memoized search strategy for this database.
+    def strategy
+      @strategy ||= SearchStrategy.new(@db, @progress_proc)
+    end
+    # Pool for link jobs: 4 workers if the strategy links concurrently, else 1.
+    def create_link_pool
+      Pool.new(strategy.concurrent_link? ? 4 : 1)
+    end
+    def link_on_import?
+      @link_on_import
+    end
+    def link_on_import!
+      @link_on_import = true
+    end
+    # Returns the id of the stored text whose digest equals that of the file's
+    # UTF-8 content, or nil when there is none.
+    def get_id(file)
+      text = File.read(file).to_utf
+      saved = TaggedText[:digest => text.digest]
+      saved ? saved.id : nil
+    end
+    # Imports the given files/paths (expanded by the text processor).
+    #
+    # Files listed in 'unprocessed.lst' are skipped; files that raise during
+    # import are reported, remembered and appended to that list. With
+    # link_on_import? set, every newly imported text is linked, followed by a
+    # final link over all texts.
+    def read(*files)
+      files = tp.to_filelist(*files)
+      prog = Progress.new(:read, files.size)
+      puts "Reading #{files.size} files"
+      @unprocessed = if File.exist?('unprocessed.lst')
+        File.readlines('unprocessed.lst').map(&:chomp)
+      else
+        []
+      end
+      File.open('unprocessed.lst', 'a') do |up|
+        l = create_link_pool
+        p = Pool.new(WORKERS)
+        l.schedule{ link! } if RELINK
+        files.each_with_index do |file, index|
+          if @unprocessed.member?(file)
+            puts "Ignoring #{file}"
+            next
+          end
+          p.schedule do
+            begin
+              tt = read_file(file)
+              # read_file returns the record (nil if already imported), while
+              # link works on ids, so the record's id is passed on.
+              l.schedule { link tt.id } if tt && link_on_import?
+            rescue
+              puts "Error on file #{file}: #{$!}", $!.backtrace
+              @unprocessed << file
+              up.puts file
+            end
+            progress(prog, index + 1)
+          end
+        end
+        l.schedule { link } if link_on_import? # make sure everything is linked
+        p.shutdown
+        l.shutdown
+      end
+    end
+    # Imports one file unless a record with the same filename and digest
+    # exists. The tagged file next to it is reused when present; otherwise the
+    # text is processed and the tagged output is written to that file.
+    #
+    # Yields the new record if a block is given.
+    # Returns the new TaggedText, or nil when the file was already imported.
+    def read_file(file)
+      text = File.read(file).to_utf
+      saved = TaggedText.exists?(:filename => file, :digest => text.digest)
+      unless saved
+        puts "Reading file #{file}"
+        # retrieve the tagged version
+        tagged_file = tp.create_tagged_filename(file)
+        tagged = if File.exist?(tagged_file)
+          File.read(tagged_file)
+        else
+          processed = tp.process(text)
+          File.open(tagged_file, 'w') do |out|
+            out.write processed
+          end
+          processed
+        end
+        retagged = retag(tagged)
+        tt = TaggedText.create(:digest => text.digest, :text => text, :tagged => retagged, :filename => file, :tagged_filename => tagged_file)
+        @ids << tt.id
+        yield tt if block_given?
+        tt
+      end
+    end
+    # Reloads the text of the given ids (all texts when none are given) from
+    # their source files and updates the stored digest where it differs.
+    def correct(*ids)
+      ids = all if ids.empty?
+      ids.flatten.each do |id|
+        id = id.to_i
+        d = TaggedText[id]
+        next unless d
+        file = d.filename
+        text = File.read(file).to_utf
+        d.text = text
+        expected = text.digest
+        if d.digest != expected
+          puts "Correcting text #{file}"
+          d.digest = expected
+        end
+        d.save
+      end
+    end
+    # Ids of all stored texts.
+    def all
+      TaggedText.ids
+    end
+    def simulate!
+      strategy.simulate!
+    end
+    # Like #link, but passes a block that deletes a dataset. #link does not
+    # yield at present (forcing is still a TODO there), so the block is unused.
+    def link!(*ids)
+      link(*ids) do |ds|
+        ds.delete
+      end
+    end
+    # Links the texts with the given ids (all texts when none are given) on a
+    # link pool, reporting progress per text; returns after pool shutdown.
+    def link(*ids)
+      ids = all if ids.empty?
+      ids.flatten!
+      prog = Progress.new(:link, ids.size)
+      progress(prog)
+      p = create_link_pool
+      ids.each_with_index do |id, index|
+        #ds = db[:assoc].filter(:text_id => id)
+        #yield ds if block_given?
+        # TODO implement force
+        p.schedule do
+          strategy.link_text(id) #if ds.empty?
+          progress(prog, index + 1)
+        end
+      end
+      p.shutdown
+    end
+    def create_filter(f, &block)
+      strategy.create_filter(f, &block)
+    end
+    def find(filters)
+      strategy.find(filters)
+    end
+    private
+    # Connects to db_uri; when a probe query fails, falls back to a local
+    # sqlite database. Migrates the schema and binds TaggedText to the
+    # connection. Returns the Sequel database.
+    def connect(db_uri)
+      db = Sequel.connect(db_uri)
+      begin
+        db.get(1)
+      # NOTE(review): rescuing Exception is very broad. It is kept because
+      # driver errors under JRuby may not descend from StandardError - confirm
+      # before narrowing.
+      rescue Exception => e
+        puts 'Falling back to sqlite'
+        puts e
+        db = Sequel.connect('jdbc:sqlite://concordancer.db')
+      end
+      migrate(db)
+      TaggedText.db = db
+    end
+    # Applies the migrations in the schema directory up to DB_VERSION.
+    def migrate(db)
+      migration_path = File.join(File.dirname(__FILE__), 'schema')
+      Sequel::Migrator.apply(db, migration_path, DB_VERSION)
+    end
+    # Updates prog and passes it to the progress proc, if one is set.
+    def progress(prog, done = 0, data = prog.data)
+      if progress_proc
+        prog.done = done
+        prog.data = data
+        progress_proc.call(prog)
+      end
+    end
+    # Reduces four-field tokens ("a|b|c|d") to "word|tag" tokens joined by
+    # single spaces.
+    #
+    # The tag is field 1 when the last two fields of the first token contain
+    # digits (full-tagged "word|tag|begin|end"), otherwise field 2. Text whose
+    # first token does not have four fields, and empty text, is returned
+    # unchanged.
+    def retag(text)
+      chunks = text.split(/[ \n]/)
+      return text if chunks.empty?
+      return text if (token = chunks.first.split(/\|/)).size != 4 # looks pre-retagged
+      tag_position = if token[2] =~ /\d+/ && token[3] =~ /\d+/ # looks like fulltagged
+        1
+      else
+        2
+      end
+      result = ''
+      chunks.each do |chunk|
+        # Empty chunks come from consecutive separators and carry no token.
+        next if chunk.empty?
+        fields = chunk.split(/\|/)
+        result << ' ' unless result.empty?
+        # Pick the tag field of the token, not a character of its second field.
+        result << "#{fields[0]}|#{fields[tag_position]}"
+      end
+      result
+    end
+  end
+end

data/lib/lumix/corrections.rb ADDED Viewed

@@ -0,0 +1,84 @@
+require 'lumix/charset'
+CORRECTIONS = <<-TXT
+catre | S
+fetite | NPRN
+in | S
+si | C
+circa | R
+fata de| S
+maxima | ASON
+inainte| R
+in materie de | R
+tin | V3
+beneficiaza | V3
+: | COLON
+ocupa | VN
+asigurata | VPSF
+mine | PPSA
+batut | VPSM
+insa | C
+impotriva | S
+americana | ASN
+caruia | R
+da | VN
+duce| VN
+primeasca | V3
+daca | C
+bulgara | ASN
+ramina | V3
+albaneza | ASN
+pina | S
+paraseasca | V3
+publica | ASN
+inceapa | V3
+ecologic | ASN
+internationala | ASN
+ecologista | ASN
+cada | V3
+linga | S
+adevaratele | APRY
+citiva | PI
+americana | ASN
+Miclici| NP
+fara | S
+cit | PI
+sugereaza | V3
+incasa | VN
+circa | R
+ghiceste | V3
+tarile |NPRY
+araba | ASN
+citeva | PI
+schimbindu | VG
+dupa | S
+uleiurilor_vegetale | NPOY
+botosaneana | ASN
+oricarui | PI
+TXT
+def corrections
+  @corrections ||= CORRECTIONS.split(/\n/).map do |line|
+    word, tag = line.split(/\|/).map(&:strip)
+    puts "Tagging #{word} as #{tag}"
+    [/\b#{word}\|\S+/, "#{word}\|#{tag}"]
+  end
+end
+def correct(t)
+  corrections.inject(t) do |result, (re, sub)|
+    result.gsub(re, sub)
+  end
+end
+def correct_all(path)
+  fs = Dir.glob(File.join(path, '*tagged*'))
+  fs.each do |fn|
+    t = correct(File.read(fn))
+    File.open(fn, 'w') { |f| f.print t }
+  end
+end
+if $0 == __FILE__
+  correct_all ARGV[0]
+end

data/lib/lumix/fast_search.rb ADDED Viewed

@@ -0,0 +1,91 @@
+require 'lumix/filter'
+require 'lumix/text_snippet'
+module Lumix
+  class FastSearch
+    TAGGED = /([^\s\|]+)\|(\S+)/m        # Xxx|YYY
+    ORIG = /([^\|\s]*)\|([^\|\s]*)\|([^\|\s]*)\|(\S*)/ # X|Y|Z|W
+    def initialize(db, progress)
+      @db = db
+      @progress = progress
+    end
+    def concurrent_link?
+      true
+    end
+    def link_text(id)
+      ds = TaggedText[id]
+      return ds.fulltagged if ds.fulltagged
+      file, text, tagged = ds.filename, ds.text, ds.tagged
+      puts "Linking text #{file}"
+      txt_pos = 0
+      linked = ''
+      tagged.scan(TAGGED) do |word, tag|
+        tagged_begin = $~.begin(0)
+        # expand "x_y_z" notation to "x y z"
+        word_re = Regexp.new(Regexp.escape(word).gsub(/_/, '\s*'))
+        src_match = text[txt_pos..-1].match(word_re) # find the word
+        if src_match
+          offset = src_match.begin(0)
+          src_begin = txt_pos + offset
+          src_end = txt_pos + src_match.end(0)
+          txt_pos = src_end
+          linked << ' ' unless linked.empty?
+          linked << word << '|' << tag << '|' << src_begin.to_s << '|' << src_end.to_s
+        else
+          STDERR.puts "Could not find match for '#{word}' in text #{file}"
+          STDERR.puts text[(txt_pos-10)..(txt_pos+word.size+10)]
+          `echo '#{file}:#{txt_pos}:#{tagged_begin} unmatched "#{word}"' >> unlinked.lst`
+          return nil
+        end
+      end
+      unless linked.empty?
+        ds.fulltagged = linked
+        ds.save
+      end
+      return linked
+    rescue => e # TODO remove this crap
+      STDERR.puts e
+      STDERR.puts e.backtrace
+      raise e
+    end
+    def create_filter(f, &block)
+      Lumix::Filter.new('\|(\d+)\|(\d+)', f, &block)
+    end
+    def find(filters)
+      prog = Progress.new(:search, TaggedText.count, "", 0)
+      @progress[prog] if @progress
+      TaggedText.each_with_index do |t, i|
+        # matches to ranges
+        filters.each do |f|
+          f.scan(t.fulltagged) do |hit, t_begin, t_end, m|
+            s_begin = m.captures.first.to_i
+            s_end = m.captures.last.to_i
+            fname = File.basename(t.filename)
+            tagged_snippet = Lumix::TextSnippet.new(fname, t.fulltagged, t_begin, t_end)
+            text_snippet = Lumix::TextSnippet.new(fname, t.text, s_begin, s_end)
+            f << [text_snippet, tagged_snippet]
+          end
+        end
+        prog.done = i
+        @progress[prog] if @progress
+      end
+    end
+  end
+  SearchStrategy = FastSearch
+end