xapian_db 0.3.2 → 0.3.3
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.md +14 -0
- data/README.rdoc +8 -4
- data/lib/xapian_db/config.rb +9 -12
- data/lib/xapian_db/database.rb +1 -0
- data/lib/xapian_db/document_blueprint.rb +3 -8
- data/lib/xapian_db/index_writers/direct_writer.rb +13 -6
- data/lib/xapian_db/indexer.rb +25 -30
- data/lib/xapian_db/query_parser.rb +16 -5
- data/lib/xapian_db/repositories/stemmer.rb +38 -0
- data/lib/xapian_db/repositories/stopper.rb +45 -0
- data/lib/xapian_db/resultset.rb +7 -1
- data/lib/xapian_db/stopwords/README +5 -0
- data/lib/xapian_db/stopwords/da.txt +94 -0
- data/lib/xapian_db/stopwords/de.txt +231 -0
- data/lib/xapian_db/stopwords/en.txt +174 -0
- data/lib/xapian_db/stopwords/es.txt +308 -0
- data/lib/xapian_db/stopwords/fi.txt +68 -0
- data/lib/xapian_db/stopwords/fr.txt +163 -0
- data/lib/xapian_db/stopwords/hu.txt +199 -0
- data/lib/xapian_db/stopwords/it.txt +279 -0
- data/lib/xapian_db/stopwords/nl.txt +101 -0
- data/lib/xapian_db/stopwords/no.txt +176 -0
- data/lib/xapian_db/stopwords/pt.txt +203 -0
- data/lib/xapian_db/stopwords/ru.txt +159 -0
- data/lib/xapian_db/stopwords/sv.txt +114 -0
- data/lib/xapian_db/stopwords/update_stopwords.rb +35 -0
- data/lib/xapian_db.rb +25 -14
- metadata +20 -3
@@ -0,0 +1,35 @@
|
|
1
|
+
require "fileutils"
|
2
|
+
|
3
|
+
LANGUAGE_MAP = {:danish => :da,
|
4
|
+
:dutch => :nl,
|
5
|
+
:english => :en,
|
6
|
+
:finnish => :fi,
|
7
|
+
:french => :fr,
|
8
|
+
:german => :de,
|
9
|
+
:hungarian => :hu,
|
10
|
+
:italian => :it,
|
11
|
+
:norwegian => :no,
|
12
|
+
:portuguese => :pt,
|
13
|
+
:russian => :ru,
|
14
|
+
:spanish => :es,
|
15
|
+
:swedish => :sv}
|
16
|
+
|
17
|
+
# 1. Load the stop words files from snowball.tartarus.org
|
18
|
+
LANGUAGE_MAP.keys.reject{|k| k == :russian}.each { |l| system("curl http://snowball.tartarus.org/algorithms/%s/stop.txt | iconv -f ISO-8859-1 -t UTF-8 > %s.txt" % [l, l]) }
|
19
|
+
system("curl http://snowball.tartarus.org/algorithms/russian/stop.txt | iconv -f KOI8-R -t UTF-8 > russian.txt")
|
20
|
+
|
21
|
+
# 2. Clean up the files (remove comments) and write a new file with the iso name
|
22
|
+
LANGUAGE_MAP.keys.each do |lang|
|
23
|
+
open("#{LANGUAGE_MAP[lang]}.txt", "w") do |outfile|
|
24
|
+
open("#{lang}.txt", "r") do |infile|
|
25
|
+
while line = infile.gets
|
26
|
+
outfile.puts line.split(" ", 2).first.downcase.strip unless line =~ /^ +|^$|^\|/
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
# 3. Remove the downloaded files
|
33
|
+
LANGUAGE_MAP.keys.each {|lang| FileUtils.rm_rf "#{lang}.txt"}
|
34
|
+
|
35
|
+
|
data/lib/xapian_db.rb
CHANGED
@@ -1,17 +1,35 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
|
-
require 'digest/sha1'
|
4
|
-
require 'xapian'
|
5
|
-
require 'yaml'
|
6
|
-
|
7
3
|
# This is the top level module of xapian_db. It allows you to
|
8
4
|
# configure XapianDB, create / open databases and perform
|
9
5
|
# searches.
|
10
6
|
|
11
7
|
# @author Gernot Kogler
|
12
8
|
|
9
|
+
require 'xapian'
|
10
|
+
require 'yaml'
|
11
|
+
|
13
12
|
module XapianDb
|
14
13
|
|
14
|
+
# Supported languages
|
15
|
+
LANGUAGE_MAP = {:da => :danish,
|
16
|
+
:nl => :dutch,
|
17
|
+
:en => :english,
|
18
|
+
:fi => :finnish,
|
19
|
+
:fr => :french,
|
20
|
+
:de => :german2, # Normalises umlauts and ß
|
21
|
+
:hu => :hungarian,
|
22
|
+
:it => :italian,
|
23
|
+
:nb => :norwegian,
|
24
|
+
:nn => :norwegian,
|
25
|
+
:no => :norwegian,
|
26
|
+
:pt => :portuguese,
|
27
|
+
:ro => :romanian,
|
28
|
+
:ru => :russian,
|
29
|
+
:es => :spanish,
|
30
|
+
:sv => :swedish,
|
31
|
+
:tr => :turkish}
|
32
|
+
|
15
33
|
# Global configuration for XapianDb. See {XapianDb::Config.setup}
|
16
34
|
# for available options
|
17
35
|
def self.setup(&block)
|
@@ -61,16 +79,9 @@ module XapianDb
|
|
61
79
|
|
62
80
|
end
|
63
81
|
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
require File.dirname(__FILE__) + '/xapian_db/adapters/active_record_adapter'
|
68
|
-
require File.dirname(__FILE__) + '/xapian_db/index_writers/direct_writer'
|
69
|
-
require File.dirname(__FILE__) + '/xapian_db/database'
|
70
|
-
require File.dirname(__FILE__) + '/xapian_db/document_blueprint'
|
71
|
-
require File.dirname(__FILE__) + '/xapian_db/indexer'
|
72
|
-
require File.dirname(__FILE__) + '/xapian_db/query_parser'
|
73
|
-
require File.dirname(__FILE__) + '/xapian_db/resultset'
|
82
|
+
do_not_require = %w(update_stopwords.rb railtie.rb)
|
83
|
+
files = Dir.glob("#{File.dirname(__FILE__)}/**/*.rb").reject{|path| do_not_require.include?(File.basename(path))}
|
84
|
+
files.each {|file| require file}
|
74
85
|
|
75
86
|
# Configure XapianDB if we are in a Rails app
|
76
87
|
require File.dirname(__FILE__) + '/xapian_db/railtie' if defined?(Rails)
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 3
|
8
|
-
- 2
|
9
|
-
version: 0.3.2
|
8
|
+
- 3
|
9
|
+
version: 0.3.3
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Gernot Kogler
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-12-
|
17
|
+
date: 2010-12-13 00:00:00 +01:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -66,7 +66,24 @@ files:
|
|
66
66
|
- lib/xapian_db/indexer.rb
|
67
67
|
- lib/xapian_db/query_parser.rb
|
68
68
|
- lib/xapian_db/railtie.rb
|
69
|
+
- lib/xapian_db/repositories/stemmer.rb
|
70
|
+
- lib/xapian_db/repositories/stopper.rb
|
69
71
|
- lib/xapian_db/resultset.rb
|
72
|
+
- lib/xapian_db/stopwords/da.txt
|
73
|
+
- lib/xapian_db/stopwords/de.txt
|
74
|
+
- lib/xapian_db/stopwords/en.txt
|
75
|
+
- lib/xapian_db/stopwords/es.txt
|
76
|
+
- lib/xapian_db/stopwords/fi.txt
|
77
|
+
- lib/xapian_db/stopwords/fr.txt
|
78
|
+
- lib/xapian_db/stopwords/hu.txt
|
79
|
+
- lib/xapian_db/stopwords/it.txt
|
80
|
+
- lib/xapian_db/stopwords/nl.txt
|
81
|
+
- lib/xapian_db/stopwords/no.txt
|
82
|
+
- lib/xapian_db/stopwords/pt.txt
|
83
|
+
- lib/xapian_db/stopwords/README
|
84
|
+
- lib/xapian_db/stopwords/ru.txt
|
85
|
+
- lib/xapian_db/stopwords/sv.txt
|
86
|
+
- lib/xapian_db/stopwords/update_stopwords.rb
|
70
87
|
- lib/xapian_db.rb
|
71
88
|
- LICENSE
|
72
89
|
- README.rdoc
|