crm114 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +3 -0
- data/LICENSE +19 -0
- data/Manifest.txt +8 -0
- data/README +59 -0
- data/Rakefile +41 -0
- data/lib/crm114.rb +88 -0
- data/test/test_code_or_text.rb +29 -0
- data/test/test_crm114.rb +61 -0
- metadata +63 -0
data/CHANGELOG
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
Copyright (c) 2005-2006 Arto Bendiken <http://bendiken.net/>
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
of this software and associated documentation files (the "Software"), to
|
5
|
+
deal in the Software without restriction, including without limitation the
|
6
|
+
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
7
|
+
sell copies of the Software, and to permit persons to whom the Software is
|
8
|
+
furnished to do so, subject to the following conditions:
|
9
|
+
|
10
|
+
The above copyright notice and this permission notice shall be included in
|
11
|
+
all copies or substantial portions of the Software.
|
12
|
+
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
18
|
+
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
19
|
+
IN THE SOFTWARE.
|
data/Manifest.txt
ADDED
data/README
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
== CRM114 Controllable Regex Mutilator for Ruby
|
2
|
+
|
3
|
+
This is a Ruby interface to the CRM114 Controllable Regex Mutilator, an
|
4
|
+
advanced and fast text classifier that uses sparse binary polynomial
|
5
|
+
matching with a Bayesian Chain Rule evaluator and a hidden Markov model to
|
6
|
+
categorize data with up to 99.87% accuracy.
|
7
|
+
|
8
|
+
The Ruby wrapper grew out of this:
|
9
|
+
* http://bendiken.net/2006/07/05/spam-filters-alien-technology-and-ruby-on-rails
|
10
|
+
|
11
|
+
=== About CRM114
|
12
|
+
|
13
|
+
* http://crm114.sourceforge.net
|
14
|
+
* http://en.wikipedia.org/wiki/CRM114
|
15
|
+
* http://en.wikipedia.org/wiki/Dr_Strangelove
|
16
|
+
* http://www.paulgraham.com/wsy.html
|
17
|
+
|
18
|
+
== Download
|
19
|
+
|
20
|
+
* http://rubyforge.org/projects/crm114
|
21
|
+
* gem install crm114
|
22
|
+
* svn checkout svn://rubyforge.org/var/svn/crm114
|
23
|
+
|
24
|
+
== Dependencies
|
25
|
+
|
26
|
+
Requires the CRM114 binaries to be installed. Specifically, the '+crm+'
|
27
|
+
binary should be accessible in the current user's PATH environment variable.
|
28
|
+
|
29
|
+
== Usage
|
30
|
+
|
31
|
+
The CRM114 library interface is very similar to that of the
|
32
|
+
Classifier[http://rubyforge.org/projects/classifier/] project.
|
33
|
+
|
34
|
+
Here follows a brief example:
|
35
|
+
|
36
|
+
require 'crm114'
|
37
|
+
crm = Classifier::CRM114.new([:interesting, :boring])
|
38
|
+
crm.train! :interesting, 'Some data set with a decent signal to noise ratio.'
|
39
|
+
crm.train! :boring, 'Pig latin, as in lorem ipsum dolor sit amet.'
|
40
|
+
crm.classify 'Lorem ipsum' => [:boring, 0.99]
|
41
|
+
crm.interesting? 'Lorem ipsum' => false
|
42
|
+
crm.boring? 'Lorem ipsum' => true
|
43
|
+
|
44
|
+
Have a look at the included unit tests for more comprehensive examples.
|
45
|
+
|
46
|
+
== Related Projects
|
47
|
+
|
48
|
+
* http://www.elegantchaos.com/node/129 (crm.py)
|
49
|
+
* http://rubyforge.org/projects/classifier/
|
50
|
+
* http://rubyforge.org/projects/bishop/
|
51
|
+
|
52
|
+
== Author
|
53
|
+
|
54
|
+
Arto Bendiken (mailto:arto.bendiken@gmail.com) - http://bendiken.net
|
55
|
+
|
56
|
+
== License
|
57
|
+
|
58
|
+
Released under the terms of the MIT license. See the accompanying LICENSE
|
59
|
+
file for more information.
|
data/Rakefile
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
# Put the project's own lib/ directory at the front of the load path so that
# the `require 'crm114'` below picks up this checkout.
$LOAD_PATH.unshift(File.expand_path('lib', File.dirname(__FILE__)))

require 'rubygems'
require 'crm114'

# Package metadata handed to Hoe below.
PKG_NAME    = 'crm114'
PKG_VERSION = Classifier::CRM114::VERSION
PKG_DESC    = 'Ruby interface to the CRM114 Controllable Regex Mutilator text classification engine.'
PKG_URL     = 'http://crm114.rubyforge.org/'

PKG_AUTHOR  = 'Arto Bendiken'
PKG_EMAIL   = 'arto.bendiken@gmail.com'

##############################################################################

require 'hoe'

# Gem specification: the long description is taken from the README and the
# release notes from the top of the CHANGELOG.
Hoe.new(PKG_NAME, PKG_VERSION) do |spec|
  spec.author      = PKG_AUTHOR
  spec.email       = PKG_EMAIL
  spec.url         = PKG_URL
  spec.summary     = PKG_DESC
  spec.description = spec.paragraphs_of('README', 1).first
  spec.changes     = spec.paragraphs_of('CHANGELOG', 0..1).join("\n\n")
  spec.spec_extras = { :rdoc_options => ['--main', 'README'] }
end
|
27
|
+
|
28
|
+
##############################################################################
|
29
|
+
|
30
|
+
# Prints every line matching +pattern+ found in the files selected by the
# glob expression +files+, one match per output line, formatted as
# "path:lineno:line" with 1-based line numbers.
#
# pattern:: a Regexp tested against each line.
# files::   a glob string understood by Dir[].
#
# Returns the array of file names produced by the glob.
def egrep(pattern, files)
  Dir[files].each do |file|
    # File.readlines opens, reads and closes the file in one call, so no
    # file handle is left open for the garbage collector to reclaim.
    File.readlines(file).each_with_index do |line, index|
      puts "#{file}:#{index + 1}:#{line}" if line =~ pattern
    end
  end
end
|
37
|
+
|
38
|
+
desc 'Look for TODO and FIXME tags in the code base.'
task :todo do
  # Parentheses make the leading "/" parse unambiguously as the start of a
  # regexp literal rather than as a division operator.
  egrep(/#.*(FIXME|TODO)/, '**/*.rb')
end
|
data/lib/crm114.rb
ADDED
@@ -0,0 +1,88 @@
|
|
1
|
+
# Author:: Arto Bendiken (mailto:arto.bendiken@gmail.com)
|
2
|
+
# Copyright:: Copyright (c) 2006 Arto Bendiken.
|
3
|
+
# License:: MIT
|
4
|
+
|
5
|
+
module Classifier

  # Interface to the CRM114 text classifier. All work is delegated to the
  # +crm+ command-line binary, which is started through IO.popen; each
  # category is backed by a file named "<category>.css" inside the
  # classifier's storage directory.
  class CRM114

    VERSION = '1.0.0'

    CLASSIFICATION_TYPE = '<osb unique microgroom>'
    FILE_EXTENSION = '.css'
    CMD_CRM = '/usr/bin/env crm'
    OPT_LEARN = '-{ learn %s ( %s ) }'
    OPT_CLASSIFY = '-{ isolate (:stats:); classify %s ( %s ) (:stats:); match [:stats:] (:: :best: :prob:) /Best match to file .. \\(%s\\/([[:graph:]]+)\\%s\\) prob: ([0-9.]+)/; output /:*:best:\\t:*:prob:/ }'

    # Returns a string containing the installed CRM114 engine version in a
    # format such as "20060118-BlameTheReavers", or nil when the command
    # prints nothing on standard output or the banner is not recognized.
    def self.version
      # IO#gets yields nil on an empty stream where IO#readline would raise
      # EOFError.
      banner = IO.popen(CMD_CRM + ' -v', 'r') { |pipe| pipe.gets }
      banner[/CRM114, version ([\d\w\-\.]+)/, 1] if banner
    end

    # Builds the shell command line that runs the given inline CRM114
    # _script_. The script is wrapped in single quotes, and any single quote
    # inside it is written as '\'' so that the shell still receives the
    # script as one argument.
    def self.command_for(script) # :nodoc:
      CMD_CRM + " '" + script.gsub("'") { "'\\''" } + "'"
    end

    # Runs a learn command against _file_ and closes the pipe immediately,
    # without writing any training data to it.
    #
    # Singleton methods are not affected by the +protected+ keyword, so this
    # method has always been publicly callable; it is defined here, ahead of
    # the protected section, to make that visible.
    def self.create_css_file(file)
      cmd = command_for(OPT_LEARN % [CLASSIFICATION_TYPE, file])
      IO.popen(cmd, 'w') { |pipe| pipe.close }
    end

    # Returns a new CRM114 classifier defined by the given _categories_.
    #
    # categories:: a single category name or a collection of them; each is
    #              converted to a Symbol.
    # options::    <tt>:path</tt> is the directory holding the category
    #              files (defaults to the current directory);
    #              <tt>:debug</tt> prints every command before it is run.
    def initialize(categories, options = {})
      # Kernel#Array accepts a lone Symbol or String as well as a collection,
      # whereas Symbol#to_a does not exist on Ruby 1.9 and later.
      @categories = Array(categories).collect { |category| category.to_s.to_sym }
      @path = File.expand_path(options[:path] || '.')
      @debug = options[:debug] || false
    end

    # Trains the classifier to consider the given _text_ to be a sample from
    # the set named by _category_.
    #
    # When a block is given it receives the write end of the pipe to the
    # +crm+ process and is responsible for writing the sample itself; _text_
    # may then be omitted.
    def learn!(category, text = nil, &block)
      cmd = self.class.command_for(OPT_LEARN % [CLASSIFICATION_TYPE, css_file_path(category)])
      puts cmd if @debug
      IO.popen(cmd, 'w') { |pipe| block_given? ? block.call(pipe) : pipe.write(text) }
    end

    alias_method :train!, :learn!

    # Always raises RuntimeError: unlearning is not implemented.
    def unlearn!(category, text, &block) # :nodoc:
      raise 'unlearning not supported at present'
    end

    alias_method :untrain!, :unlearn! #:nodoc:

    # Returns the classification of the provided _text_ as a tuple
    # containing the highest-probability category and a confidence indicator
    # in the range of 0.5..1.0.
    #
    # When a block is given it receives the pipe to the +crm+ process and
    # writes the data to classify itself. Returns <tt>[nil, 0.0]</tt> when
    # the process yields no tab-separated result line.
    def classify(text = nil, &block)
      files = @categories.collect { |category| css_file_path(category) }
      # Slashes in the storage path are backslash-escaped because the path is
      # interpolated into the /.../ match pattern of the CRM114 script.
      script = OPT_CLASSIFY % [CLASSIFICATION_TYPE, files.join(' '), @path.gsub(/\//, '\/'), FILE_EXTENSION]
      cmd = self.class.command_for(script)
      puts cmd if @debug
      result = IO.popen(cmd, 'r+') do |pipe|
        block_given? ? block.call(pipe) : pipe.write(text)
        # Signal end of input before reading the single result line.
        pipe.close_write
        pipe.readline unless pipe.closed? || pipe.eof?
      end
      return [nil, 0.0] unless result && result.include?("\t")
      fields = result.split("\t")
      [fields.first.to_sym, fields.last.to_f]
    end

    # Provides a trainer and a predicate for every configured category:
    # <tt>crm.spam!(text)</tt> is <tt>learn!(:spam, text)</tt>, and
    # <tt>crm.spam?(text)</tt> is true when <tt>classify(text)</tt> picks
    # +:spam+. A block given to either form is passed on.
    def method_missing(symbol, *args, &block) # :nodoc:
      name = symbol.to_s
      category = name.chop.to_sym
      if @categories.include?(category)
        case name[-1, 1]
        when '!'
          return learn!(category, *args, &block)
        when '?' # it's a predicate
          return classify(*args, &block).first == category
        end
      end
      super
    end

    # Keeps respond_to? in agreement with the dynamic methods handled by
    # method_missing.
    def respond_to_missing?(symbol, include_private = false) # :nodoc:
      name = symbol.to_s
      dynamic = %w[! ?].include?(name[-1, 1]) && @categories.include?(name.chop.to_sym)
      dynamic || super
    end

    protected

      # Returns the full path of the data file backing _category_.
      def css_file_path(category)
        File.join(@path, category.to_s + FILE_EXTENSION)
      end

  end

end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
$:.unshift(File.expand_path(File.dirname(__FILE__) + '/../lib'))
|
2
|
+
|
3
|
+
require 'test/unit'
|
4
|
+
require 'crm114'
|
5
|
+
|
6
|
+
# Trains a two-category classifier with the project's own Ruby sources
# (:code) and its plain-text documents (:text), then checks that a short
# sample of each kind is classified accordingly.
class TestCodeOrText < Test::Unit::TestCase

  def setup
    @path = File.dirname(__FILE__)
    @crm = Classifier::CRM114.new([:code, :text], :path => @path)
    assert_nothing_raised do
      Dir["#{@path}/../lib/*.rb"].each { |file| @crm.code! File.read(file) }
      # Resolve the documents relative to this test file, like the sources
      # above, so the suite does not depend on the current working directory.
      ['CHANGELOG', 'README', 'LICENSE'].each { |file| @crm.text! File.read(File.join(@path, '..', file)) }
    end
  end

  # Removes the category files written into the test directory during setup.
  def teardown
    Dir["#{@path}/*.css"].each { |file| File.delete(file) }
  end

  def test_code
    assert @crm.code?('class DrStrangelove; def self.lesson; stop_worrying && love_the_bomb; end; end')
  end

  def test_text
    assert @crm.text?('This an interface to the Dr. Strangelove-inspired CRM114 Controllable Regex Mutilator.')
  end

end
|
data/test/test_crm114.rb
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
$:.unshift(File.expand_path(File.dirname(__FILE__) + '/../lib'))
|
2
|
+
|
3
|
+
require 'test/unit'
|
4
|
+
require 'crm114'
|
5
|
+
|
6
|
+
# Exercises the classifier with two categories, :interesting and :boring,
# each trained from one short sample text before every test.
class TestCRM114 < Test::Unit::TestCase

  # Creates the classifier in the test directory and feeds it one sample per
  # category.
  def setup
    @path = File.dirname(__FILE__)
    @crm = Classifier::CRM114.new([:interesting, :boring], :path => @path)
    assert_nothing_raised do
      @crm.train!(:interesting, <<EOT)
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called
data. The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In
effect, we conjure the spirits of the computer with our spells.
EOT
      @crm.train!(:boring, <<EOT)
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Nam vitae
nisi nec sapien congue porttitor. Proin quam risus, pharetra non,
lacinia sed, vehicula non, eros. Suspendisse velit augue, aliquet
vel, sagittis vitae, porttitor sed, metus. Integer tortor tellus,
tempus tincidunt, viverra a, fringilla vitae, sapien. Ut ac eros.
Donec molestie nulla sed nibh. Pellentesque quam quam, vehicula sed,
venenatis vitae, tristique quis, lectus. Aenean odio purus, pharetra
non, facilisis sed, rutrum eu, lectus. Curabitur odio. Ut laoreet
dolor vitae nunc. Donec dapibus. Morbi tempor libero et dolor.
Aliquam rutrum metus quis nibh. Ut pharetra turpis vel metus. Donec
vel arcu. Sed neque orci, accumsan et, faucibus quis, porttitor in,
lacus. Lorem ipsum dolor sit amet, consectetuer adipiscing elit.
Phasellus et arcu. Mauris nunc.
EOT
    end
  end

  # Deletes the category files that training left in the test directory.
  def teardown
    Dir.glob("#{@path}/*.css").each do |css_file|
      File.delete(css_file)
    end
  end

  # The engine version looks like "<digits>-<word>".
  def test_version
    engine_version = Classifier::CRM114.version
    assert_match(/^[\d]+-[\w\d]+$/, engine_version)
  end

  # Unlearning is not implemented and must raise.
  def test_unlearning
    assert_raise(RuntimeError) do
      @crm.unlearn!(:boring, 'Lorem ipsum')
    end
  end

  def test_interesting
    best_category = @crm.classify('Thus, programs must be written for people to read,').first
    assert_equal(:interesting, best_category)
    assert_equal(true, @crm.interesting?('and only incidentally for machines to execute.'))
    assert_equal(false, @crm.boring?('learning to program is considerably less dangerous than learning sorcery'))
  end

  def test_boring
    best_category = @crm.classify('Lorem ipsum dolor sit amet, sed neque orci.').first
    assert_equal(:boring, best_category)
    assert_equal(false, @crm.interesting?('Donec dapibus. Morbi tempor libero et dolor.'))
    assert_equal(true, @crm.boring?('Aliquam rutrum metus quis nibh. Ut pharetra turpis vel metus.'))
  end

end
|
metadata
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.9.0
|
3
|
+
specification_version: 1
|
4
|
+
name: crm114
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: 1.0.0
|
7
|
+
date: 2006-11-06 00:00:00 +01:00
|
8
|
+
summary: Ruby interface to the CRM114 Controllable Regex Mutilator text classification engine.
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
- test
|
12
|
+
email: arto.bendiken@gmail.com
|
13
|
+
homepage: http://crm114.rubyforge.org/
|
14
|
+
rubyforge_project: crm114
|
15
|
+
description: This is a Ruby interface to the CRM114 Controllable Regex Mutilator, an advanced and fast text classifier that uses sparse binary polynomial matching with a Bayesian Chain Rule evaluator and a hidden Markov model to categorize data with up to a 99.87% accuracy.
|
16
|
+
autorequire:
|
17
|
+
default_executable:
|
18
|
+
bindir: bin
|
19
|
+
has_rdoc: true
|
20
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
21
|
+
requirements:
|
22
|
+
- - ">"
|
23
|
+
- !ruby/object:Gem::Version
|
24
|
+
version: 0.0.0
|
25
|
+
version:
|
26
|
+
platform: ruby
|
27
|
+
signing_key:
|
28
|
+
cert_chain:
|
29
|
+
post_install_message:
|
30
|
+
authors:
|
31
|
+
- Arto Bendiken
|
32
|
+
files:
|
33
|
+
- CHANGELOG
|
34
|
+
- LICENSE
|
35
|
+
- Manifest.txt
|
36
|
+
- README
|
37
|
+
- Rakefile
|
38
|
+
- lib/crm114.rb
|
39
|
+
- test/test_code_or_text.rb
|
40
|
+
- test/test_crm114.rb
|
41
|
+
test_files: []
|
42
|
+
|
43
|
+
rdoc_options:
|
44
|
+
- --main
|
45
|
+
- README
|
46
|
+
extra_rdoc_files: []
|
47
|
+
|
48
|
+
executables: []
|
49
|
+
|
50
|
+
extensions: []
|
51
|
+
|
52
|
+
requirements: []
|
53
|
+
|
54
|
+
dependencies:
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: hoe
|
57
|
+
version_requirement:
|
58
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
59
|
+
requirements:
|
60
|
+
- - ">="
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
version: 1.1.2
|
63
|
+
version:
|