crm114 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +3 -0
- data/LICENSE +19 -0
- data/Manifest.txt +8 -0
- data/README +59 -0
- data/Rakefile +41 -0
- data/lib/crm114.rb +88 -0
- data/test/test_code_or_text.rb +29 -0
- data/test/test_crm114.rb +61 -0
- metadata +63 -0
data/CHANGELOG
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
Copyright (c) 2005-2006 Arto Bendiken <http://bendiken.net/>
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
of this software and associated documentation files (the "Software"), to
|
5
|
+
deal in the Software without restriction, including without limitation the
|
6
|
+
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
7
|
+
sell copies of the Software, and to permit persons to whom the Software is
|
8
|
+
furnished to do so, subject to the following conditions:
|
9
|
+
|
10
|
+
The above copyright notice and this permission notice shall be included in
|
11
|
+
all copies or substantial portions of the Software.
|
12
|
+
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
18
|
+
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
19
|
+
IN THE SOFTWARE.
|
data/Manifest.txt
ADDED
data/README
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
== CRM114 Controllable Regex Mutilator for Ruby
|
2
|
+
|
3
|
+
This is a Ruby interface to the CRM114 Controllable Regex Mutilator, an
|
4
|
+
advanced and fast text classifier that uses sparse binary polynomial
|
5
|
+
matching with a Bayesian Chain Rule evaluator and a hidden Markov model to
|
6
|
+
categorize data with up to 99.87% accuracy.
|
7
|
+
|
8
|
+
The Ruby wrapper grew out of this:
|
9
|
+
* http://bendiken.net/2006/07/05/spam-filters-alien-technology-and-ruby-on-rails
|
10
|
+
|
11
|
+
=== About CRM114
|
12
|
+
|
13
|
+
* http://crm114.sourceforge.net
|
14
|
+
* http://en.wikipedia.org/wiki/CRM114
|
15
|
+
* http://en.wikipedia.org/wiki/Dr_Strangelove
|
16
|
+
* http://www.paulgraham.com/wsy.html
|
17
|
+
|
18
|
+
== Download
|
19
|
+
|
20
|
+
* http://rubyforge.org/projects/crm114
|
21
|
+
* gem install crm114
|
22
|
+
* svn checkout svn://rubyforge.org/var/svn/crm114
|
23
|
+
|
24
|
+
== Dependencies
|
25
|
+
|
26
|
+
Requires the CRM114 binaries to be installed. Specifically, the '+crm+'
|
27
|
+
binary should be accessible in the current user's PATH environment variable.
|
28
|
+
|
29
|
+
== Usage
|
30
|
+
|
31
|
+
The CRM114 library interface is very similar to that of the
|
32
|
+
Classifier[http://rubyforge.org/projects/classifier/] project.
|
33
|
+
|
34
|
+
Here follows a brief example:
|
35
|
+
|
36
|
+
require 'crm114'
|
37
|
+
crm = Classifier::CRM114.new([:interesting, :boring])
|
38
|
+
crm.train! :interesting, 'Some data set with a decent signal to noise ratio.'
|
39
|
+
crm.train! :boring, 'Pig latin, as in lorem ipsum dolor sit amet.'
|
40
|
+
crm.classify 'Lorem ipsum' # => [:boring, 0.99]
|
41
|
+
crm.interesting? 'Lorem ipsum' # => false
|
42
|
+
crm.boring? 'Lorem ipsum' # => true
|
43
|
+
|
44
|
+
Have a look at the included unit tests for more comprehensive examples.
|
45
|
+
|
46
|
+
== Related Projects
|
47
|
+
|
48
|
+
* http://www.elegantchaos.com/node/129 (crm.py)
|
49
|
+
* http://rubyforge.org/projects/classifier/
|
50
|
+
* http://rubyforge.org/projects/bishop/
|
51
|
+
|
52
|
+
== Author
|
53
|
+
|
54
|
+
Arto Bendiken (mailto:arto.bendiken@gmail.com) - http://bendiken.net
|
55
|
+
|
56
|
+
== License
|
57
|
+
|
58
|
+
Released under the terms of the MIT license. See the accompanying LICENSE
|
59
|
+
file for more information.
|
data/Rakefile
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
# Put the in-tree lib/ directory first on the load path so that the
# require of 'crm114' below picks up this checkout of the library.
$LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__) + '/lib'))

require 'rubygems'
require 'crm114'

# Package metadata handed to Hoe below.
PKG_NAME    = 'crm114'
PKG_VERSION = Classifier::CRM114::VERSION
PKG_DESC    = 'Ruby interface to the CRM114 Controllable Regex Mutilator text classification engine.'
PKG_URL     = 'http://crm114.rubyforge.org/'

PKG_AUTHOR  = 'Arto Bendiken'
PKG_EMAIL   = 'arto.bendiken@gmail.com'

##############################################################################

require 'hoe'

# Configure the Hoe project: the description is taken from the README and
# the change notes from the CHANGELOG.
Hoe.new(PKG_NAME, PKG_VERSION) do |project|
  project.author      = PKG_AUTHOR
  project.email       = PKG_EMAIL
  project.url         = PKG_URL
  project.summary     = PKG_DESC
  project.description = project.paragraphs_of('README', 1).first
  project.changes     = project.paragraphs_of('CHANGELOG', 0..1).join("\n\n")
  project.spec_extras = { :rdoc_options => ['--main', 'README'] }
end
|
27
|
+
|
28
|
+
##############################################################################
|
29
|
+
|
30
|
+
# Prints every line matching _pattern_ in the files selected by the glob
# _files_, formatted as "path:lineno:line" with 1-based line numbers.
#
# Each file is read with File.readlines, which closes the file once it has
# been read instead of leaving an open handle behind per scanned file.
#
# Returns the array of file names matched by the glob.
def egrep(pattern, files)
  Dir[files].each do |file|
    File.readlines(file).each_with_index do |line, lineno|
      puts "#{file}:#{lineno + 1}:#{line}" if line =~ pattern
    end
  end
end
|
37
|
+
|
38
|
+
desc 'Look for TODO and FIXME tags in the code base.'
task :todo do
  # Parenthesised so the regexp literal cannot be misread as a division
  # (Ruby otherwise warns about an "ambiguous first argument").
  egrep(/#.*(FIXME|TODO)/, '**/*.rb')
end
|
data/lib/crm114.rb
ADDED
@@ -0,0 +1,88 @@
|
|
1
|
+
# Author:: Arto Bendiken (mailto:arto.bendiken@gmail.com)
|
2
|
+
# Copyright:: Copyright (c) 2006 Arto Bendiken.
|
3
|
+
# License:: MIT
|
4
|
+
|
5
|
+
module Classifier

  # Thin wrapper around the command-line +crm+ binary. Every operation
  # spawns <tt>/usr/bin/env crm</tt> with an inline CRM114 program and talks
  # to it over a pipe. Each category is backed by a statistics file named
  # <tt><category>.css</tt> inside the configured path.
  #
  # For every configured category +foo+ the instance also answers
  # <tt>foo!(text)</tt> (same as <tt>learn!(:foo, text)</tt>) and
  # <tt>foo?(text)</tt> (true when +classify+ picks +foo+).
  class CRM114

    VERSION = '1.0.0'

    CLASSIFICATION_TYPE = '<osb unique microgroom>'
    FILE_EXTENSION = '.css'
    CMD_CRM = '/usr/bin/env crm'
    OPT_LEARN = '-{ learn %s ( %s ) }'
    OPT_CLASSIFY = '-{ isolate (:stats:); classify %s ( %s ) (:stats:); match [:stats:] (:: :best: :prob:) /Best match to file .. \\(%s\\/([[:graph:]]+)\\%s\\) prob: ([0-9.]+)/; output /:*:best:\\t:*:prob:/ }'

    # Returns a string containing the installed CRM114 engine version in a
    # format such as "20060118-BlameTheReavers", or +nil+ when the first
    # line printed by <tt>crm -v</tt> is missing or does not match.
    def self.version
      # gets (unlike readline) yields nil on an empty pipe instead of raising.
      banner = IO.popen(CMD_CRM + ' -v', 'r') { |pipe| pipe.gets }
      banner[/CRM114, version ([\d\w\-\.]+)/, 1] if banner
    end

    # Builds the shell command line that runs the inline CRM114 _program_.
    # The program is wrapped in single quotes; embedded single quotes are
    # rewritten as <tt>'\''</tt> so they cannot terminate the quoting early.
    # NOTE(review): this only protects the shell layer; whether CRM114 itself
    # accepts file names containing spaces or parentheses is not handled here.
    def self.shell_command(program) # :nodoc:
      # Block form of gsub: a replacement *string* would treat \' specially.
      CMD_CRM + " '" + program.gsub("'") { "'\\''" } + "'"
    end

    # Runs a learn command for _file_ and closes the pipe without writing
    # any input. Presumably this makes CRM114 create an empty statistics
    # file; nothing in this file calls it. Being a class method, it is public
    # regardless of the +protected+ section further down.
    def self.create_css_file(file)
      cmd = shell_command(OPT_LEARN % [CLASSIFICATION_TYPE, file])
      IO.popen(cmd, 'w') { |pipe| pipe.close }
    end

    # Returns a new CRM114 classifier defined by the given _categories_.
    #
    # _categories_ may be a collection or a single name; every entry is
    # normalised to a Symbol. Recognised _options_:
    # <tt>:path</tt>:: directory holding the statistics files (default: '.')
    # <tt>:debug</tt>:: when true, each command line is printed before it runs
    def initialize(categories, options = {})
      # Array() accepts nil, a lone Symbol/String and real collections alike,
      # whereas #to_a is undefined on Symbol and String in Ruby 1.9+.
      @categories = Array(categories).collect { |category| category.to_s.to_sym }
      @path = File.expand_path(options[:path] || '.')
      @debug = options[:debug] || false
    end

    # Trains the classifier to consider the given _text_ to be a sample from
    # the set named by _category_.
    #
    # When a block is given it receives the write end of the pipe to +crm+
    # and is responsible for writing the sample itself; _text_ is then
    # ignored and may be omitted.
    def learn!(category, text = nil, &block)
      cmd = self.class.shell_command(OPT_LEARN % [CLASSIFICATION_TYPE, css_file_path(category)])
      puts cmd if @debug
      IO.popen(cmd, 'w') { |pipe| block_given? ? block.call(pipe) : pipe.write(text) }
    end

    alias_method :train!, :learn!

    # Not implemented; always raises a RuntimeError.
    def unlearn!(category, text, &block) # :nodoc:
      raise 'unlearning not supported at present'
    end

    alias_method :untrain!, :unlearn! #:nodoc:

    # Returns the classification of the provided _text_ as a tuple
    # containing the highest-probability category and a confidence indicator
    # in the range of 0.5..1.0.
    #
    # When a block is given it receives the pipe to +crm+ and writes the
    # input itself instead of _text_. Returns <tt>[nil, 0.0]</tt> when +crm+
    # produces no tab-separated result line.
    def classify(text = nil, &block)
      files = @categories.collect { |category| css_file_path(category) }
      # Slashes in the path are escaped because it is embedded in a
      # /.../-delimited match pattern inside the CRM114 program.
      program = OPT_CLASSIFY % [CLASSIFICATION_TYPE, files.join(' '), @path.gsub(/\//, '\/'), FILE_EXTENSION]
      cmd = self.class.shell_command(program)
      puts cmd if @debug
      result = IO.popen(cmd, 'r+') do |pipe|
        block_given? ? block.call(pipe) : pipe.write(text)
        # Signal end of input so crm can produce its answer.
        pipe.close_write
        pipe.readline unless pipe.closed? || pipe.eof?
      end
      return [nil, 0.0] unless result && result.include?("\t")
      result = result.split("\t")
      [result.first.to_sym, result.last.to_f]
    end

    # Dispatches <tt>category!</tt> to +learn!+ and <tt>category?</tt> to a
    # comparison against the result of +classify+, for configured categories
    # only. Anything else falls through to the default behaviour.
    def method_missing(symbol, *args, &block) # :nodoc:
      if (category = category_for(symbol, '!'))
        learn!(category, *args, &block)
      elsif (category = category_for(symbol, '?')) # it's a predicate
        classify(*args, &block).first == category
      else
        super
      end
    end

    # Keeps +respond_to?+ in agreement with the dynamic methods handled by
    # +method_missing+.
    def respond_to_missing?(symbol, include_private = false) # :nodoc:
      !category_for(symbol, '!').nil? || !category_for(symbol, '?').nil? || super
    end

    protected

      # Returns the path of the statistics file backing _category_.
      def css_file_path(category)
        File.join(@path, category.to_s + FILE_EXTENSION)
      end

    private

      # Returns the configured category named by _symbol_ once the trailing
      # _suffix_ character is removed, or +nil+ when _symbol_ does not end in
      # _suffix_ or does not name a configured category.
      def category_for(symbol, suffix)
        name = symbol.to_s
        return nil unless name.length > 1 && name[-1, 1] == suffix
        category = name[0..-2].to_sym
        category if @categories.include?(category)
      end

  end

end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
$:.unshift(File.expand_path(File.dirname(__FILE__) + '/../lib'))
|
2
|
+
|
3
|
+
require 'test/unit'
|
4
|
+
require 'crm114'
|
5
|
+
|
6
|
+
# Trains a classifier with the library sources as :code and the project's
# text documents as :text, then checks that unseen samples of each kind are
# classified accordingly. Requires the +crm+ binary.
class TestCodeOrText < Test::Unit::TestCase

  # Builds and trains a fresh classifier whose statistics files live in the
  # test directory.
  def setup
    @path = File.dirname(__FILE__)
    @crm = Classifier::CRM114.new([:code, :text], :path => @path)
    assert_nothing_raised do
      Dir["#{@path}/../lib/*.rb"].each { |file| @crm.code! File.read(file) }
      # Resolve the documents against the project root (one level above the
      # test directory) rather than the current working directory, so the
      # test does not depend on where it is launched from.
      ['CHANGELOG', 'README', 'LICENSE'].each do |file|
        @crm.text! File.read(File.join(@path, '..', file))
      end
    end
  end

  # Removes the statistics files created during training.
  def teardown
    Dir["#{@path}/*.css"].each { |file| File.delete(file) }
  end

  def test_code
    assert @crm.code?('class DrStrangelove; def self.lesson; stop_worrying && love_the_bomb; end; end')
  end

  def test_text
    assert @crm.text?('This an interface to the Dr. Strangelove-inspired CRM114 Controllable Regex Mutilator.')
  end

end
|
data/test/test_crm114.rb
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
$:.unshift(File.expand_path(File.dirname(__FILE__) + '/../lib'))
|
2
|
+
|
3
|
+
require 'test/unit'
|
4
|
+
require 'crm114'
|
5
|
+
|
6
|
+
# Exercises Classifier::CRM114 against two small training samples.
# Requires the +crm+ binary.
class TestCRM114 < Test::Unit::TestCase

  # Training sample for the :interesting category.
  INTERESTING_SAMPLE = <<EOT
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called
data. The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In
effect, we conjure the spirits of the computer with our spells.
EOT

  # Training sample for the :boring category.
  BORING_SAMPLE = <<EOT
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Nam vitae
nisi nec sapien congue porttitor. Proin quam risus, pharetra non,
lacinia sed, vehicula non, eros. Suspendisse velit augue, aliquet
vel, sagittis vitae, porttitor sed, metus. Integer tortor tellus,
tempus tincidunt, viverra a, fringilla vitae, sapien. Ut ac eros.
Donec molestie nulla sed nibh. Pellentesque quam quam, vehicula sed,
venenatis vitae, tristique quis, lectus. Aenean odio purus, pharetra
non, facilisis sed, rutrum eu, lectus. Curabitur odio. Ut laoreet
dolor vitae nunc. Donec dapibus. Morbi tempor libero et dolor.
Aliquam rutrum metus quis nibh. Ut pharetra turpis vel metus. Donec
vel arcu. Sed neque orci, accumsan et, faucibus quis, porttitor in,
lacus. Lorem ipsum dolor sit amet, consectetuer adipiscing elit.
Phasellus et arcu. Mauris nunc.
EOT

  # Creates a classifier rooted in the test directory and trains it with
  # both samples, interesting first.
  def setup
    @path = File.dirname(__FILE__)
    @crm = Classifier::CRM114.new([:interesting, :boring], :path => @path)
    assert_nothing_raised do
      @crm.train!(:interesting, INTERESTING_SAMPLE)
      @crm.train!(:boring, BORING_SAMPLE)
    end
  end

  # Deletes the statistics files written while training.
  def teardown
    Dir["#{@path}/*.css"].each { |css| File.delete(css) }
  end

  def test_version
    assert_match(/^[\d]+-[\w\d]+$/, Classifier::CRM114.version)
  end

  def test_unlearning
    assert_raise(RuntimeError) { @crm.unlearn!(:boring, 'Lorem ipsum') }
  end

  def test_interesting
    best, _confidence = @crm.classify('Thus, programs must be written for people to read,')
    assert_equal(:interesting, best)
    assert_equal(true, @crm.interesting?('and only incidentally for machines to execute.'))
    assert_equal(false, @crm.boring?('learning to program is considerably less dangerous than learning sorcery'))
  end

  def test_boring
    best, _confidence = @crm.classify('Lorem ipsum dolor sit amet, sed neque orci.')
    assert_equal(:boring, best)
    assert_equal(false, @crm.interesting?('Donec dapibus. Morbi tempor libero et dolor.'))
    assert_equal(true, @crm.boring?('Aliquam rutrum metus quis nibh. Ut pharetra turpis vel metus.'))
  end

end
|
metadata
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.9.0
|
3
|
+
specification_version: 1
|
4
|
+
name: crm114
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: 1.0.0
|
7
|
+
date: 2006-11-06 00:00:00 +01:00
|
8
|
+
summary: Ruby interface to the CRM114 Controllable Regex Mutilator text classification engine.
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
- test
|
12
|
+
email: arto.bendiken@gmail.com
|
13
|
+
homepage: http://crm114.rubyforge.org/
|
14
|
+
rubyforge_project: crm114
|
15
|
+
description: This is a Ruby interface to the CRM114 Controllable Regex Mutilator, an advanced and fast text classifier that uses sparse binary polynomial matching with a Bayesian Chain Rule evaluator and a hidden Markov model to categorize data with up to a 99.87% accuracy.
|
16
|
+
autorequire:
|
17
|
+
default_executable:
|
18
|
+
bindir: bin
|
19
|
+
has_rdoc: true
|
20
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
21
|
+
requirements:
|
22
|
+
- - ">"
|
23
|
+
- !ruby/object:Gem::Version
|
24
|
+
version: 0.0.0
|
25
|
+
version:
|
26
|
+
platform: ruby
|
27
|
+
signing_key:
|
28
|
+
cert_chain:
|
29
|
+
post_install_message:
|
30
|
+
authors:
|
31
|
+
- Arto Bendiken
|
32
|
+
files:
|
33
|
+
- CHANGELOG
|
34
|
+
- LICENSE
|
35
|
+
- Manifest.txt
|
36
|
+
- README
|
37
|
+
- Rakefile
|
38
|
+
- lib/crm114.rb
|
39
|
+
- test/test_code_or_text.rb
|
40
|
+
- test/test_crm114.rb
|
41
|
+
test_files: []
|
42
|
+
|
43
|
+
rdoc_options:
|
44
|
+
- --main
|
45
|
+
- README
|
46
|
+
extra_rdoc_files: []
|
47
|
+
|
48
|
+
executables: []
|
49
|
+
|
50
|
+
extensions: []
|
51
|
+
|
52
|
+
requirements: []
|
53
|
+
|
54
|
+
dependencies:
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: hoe
|
57
|
+
version_requirement:
|
58
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
59
|
+
requirements:
|
60
|
+
- - ">="
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
version: 1.1.2
|
63
|
+
version:
|