crm114 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,3 @@
1
+ == 1.0.0 / 2006-11-06
2
+
3
+ * Initial release of CRM114 library for Ruby.
data/LICENSE ADDED
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2005-2006 Arto Bendiken <http://bendiken.net/>
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to
5
+ deal in the Software without restriction, including without limitation the
6
+ rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7
+ sell copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in
11
+ all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
18
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
19
+ IN THE SOFTWARE.
@@ -0,0 +1,8 @@
1
+ CHANGELOG
2
+ LICENSE
3
+ Manifest.txt
4
+ README
5
+ Rakefile
6
+ lib/crm114.rb
7
+ test/test_code_or_text.rb
8
+ test/test_crm114.rb
data/README ADDED
@@ -0,0 +1,59 @@
1
+ == CRM114 Controllable Regex Mutilator for Ruby
2
+
3
+ This is a Ruby interface to the CRM114 Controllable Regex Mutilator, an
4
+ advanced and fast text classifier that uses sparse binary polynomial
5
+ matching with a Bayesian Chain Rule evaluator and a hidden Markov model to
6
+ categorize data with up to 99.87% accuracy.
7
+
8
+ The Ruby wrapper grew out of this:
9
+ * http://bendiken.net/2006/07/05/spam-filters-alien-technology-and-ruby-on-rails
10
+
11
+ === About CRM114
12
+
13
+ * http://crm114.sourceforge.net
14
+ * http://en.wikipedia.org/wiki/CRM114
15
+ * http://en.wikipedia.org/wiki/Dr_Strangelove
16
+ * http://www.paulgraham.com/wsy.html
17
+
18
+ == Download
19
+
20
+ * http://rubyforge.org/projects/crm114
21
+ * gem install crm114
22
+ * svn checkout svn://rubyforge.org/var/svn/crm114
23
+
24
+ == Dependencies
25
+
26
+ Requires the CRM114 binaries to be installed. Specifically, the '+crm+'
27
+ binary should be accessible in the current user's PATH environment variable.
28
+
29
+ == Usage
30
+
31
+ The CRM114 library interface is very similar to that of the
32
+ Classifier[http://rubyforge.org/projects/classifier/] project.
33
+
34
+ Here follows a brief example:
35
+
36
+ require 'crm114'
37
+ crm = Classifier::CRM114.new([:interesting, :boring])
38
+ crm.train! :interesting, 'Some data set with a decent signal to noise ratio.'
39
+ crm.train! :boring, 'Pig latin, as in lorem ipsum dolor sit amet.'
40
+ crm.classify 'Lorem ipsum' # => [:boring, 0.99]
41
+ crm.interesting? 'Lorem ipsum' # => false
42
+ crm.boring? 'Lorem ipsum' # => true
43
+
44
+ Have a look at the included unit tests for more comprehensive examples.
45
+
46
+ == Related Projects
47
+
48
+ * http://www.elegantchaos.com/node/129 (crm.py)
49
+ * http://rubyforge.org/projects/classifier/
50
+ * http://rubyforge.org/projects/bishop/
51
+
52
+ == Author
53
+
54
+ Arto Bendiken (mailto:arto.bendiken@gmail.com) - http://bendiken.net
55
+
56
+ == License
57
+
58
+ Released under the terms of the MIT license. See the accompanying LICENSE
59
+ file for more information.
@@ -0,0 +1,41 @@
1
# Put ./lib on the load path so the library (and its VERSION constant) can be
# required straight from the working copy.
$LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__), 'lib')))

require 'rubygems'
require 'crm114'

# Package metadata handed to the Hoe specification below.
PKG_NAME    = 'crm114'
PKG_VERSION = Classifier::CRM114::VERSION
PKG_DESC    = 'Ruby interface to the CRM114 Controllable Regex Mutilator text classification engine.'
PKG_URL     = 'http://crm114.rubyforge.org/'

PKG_AUTHOR  = 'Arto Bendiken'
PKG_EMAIL   = 'arto.bendiken@gmail.com'

##############################################################################

require 'hoe'

# Declare the gem through Hoe. The description and changes are pulled from
# the README and CHANGELOG files via Hoe's paragraphs_of helper
# (NOTE(review): exact paragraph indexing is Hoe's - confirm against its docs).
Hoe.new(PKG_NAME, PKG_VERSION) do |spec|
  spec.author      = PKG_AUTHOR
  spec.email       = PKG_EMAIL
  spec.url         = PKG_URL
  spec.summary     = PKG_DESC
  spec.description = spec.paragraphs_of('README', 1).first
  spec.changes     = spec.paragraphs_of('CHANGELOG', 0..1).join("\n\n")
  spec.spec_extras = { :rdoc_options => ['--main', 'README'] }
end
27
+
28
+ ##############################################################################
29
+
30
# Prints every line that matches _pattern_ in the files selected by the glob
# _files_, formatted as "path:lineno:line" with 1-based line numbers.
#
# pattern:: a Regexp tested against each line with =~
# files::   a glob string handed to Dir[]
#
# Returns the array of file names the glob expanded to.
def egrep(pattern, files)
  Dir[files].each do |file|
    # File.readlines opens, reads and closes the file in one call, so no
    # descriptor is left open for each scanned file.
    File.readlines(file).each_with_index do |line, index|
      puts "#{file}:#{index + 1}:#{line}" if line =~ pattern
    end
  end
end
37
+
38
desc 'Look for TODO and FIXME tags in the code base.'
# Scans every Ruby file below the current directory for comment lines that
# carry a FIXME or TODO marker and prints them via egrep.
task(:todo) do
  egrep(/#.*(FIXME|TODO)/, '**/*.rb')
end
@@ -0,0 +1,88 @@
1
+ # Author:: Arto Bendiken (mailto:arto.bendiken@gmail.com)
2
+ # Copyright:: Copyright (c) 2006 Arto Bendiken.
3
+ # License:: MIT
4
+
5
module Classifier

  # Wrapper around the external +crm+ binary. Every operation formats a
  # one-line CRM114 program from the OPT_* templates and runs it through
  # IO.popen, feeding the text to be learned or classified on standard input.
  #
  # For each configured category +foo+ the instance also answers to
  # <tt>foo!(text)</tt> (train) and <tt>foo?(text)</tt> (predicate).
  class CRM114

    VERSION = '1.0.0'

    CLASSIFICATION_TYPE = '<osb unique microgroom>'
    FILE_EXTENSION = '.css'
    CMD_CRM = '/usr/bin/env crm'
    OPT_LEARN = '-{ learn %s ( %s ) }'
    OPT_CLASSIFY = '-{ isolate (:stats:); classify %s ( %s ) (:stats:); match [:stats:] (:: :best: :prob:) /Best match to file .. \\(%s\\/([[:graph:]]+)\\%s\\) prob: ([0-9.]+)/; output /:*:best:\\t:*:prob:/ }'

    # Returns a string containing the installed CRM114 engine version in a
    # format such as "20060118-BlameTheReavers", or +nil+ when the engine
    # printed no recognizable version line.
    def self.version
      # gets (unlike readline) returns nil on an empty stream instead of
      # raising EOFError.
      banner = IO.popen(CMD_CRM + ' -v', 'r') { |pipe| pipe.gets }
      banner && banner[/CRM114, version ([\d\w\-\.]+)/, 1]
    end

    # Builds the shell command line that runs the CRM114 _program_ text.
    # The program is wrapped in single quotes; any single quote inside it is
    # rewritten as '\'' so it cannot terminate the quoting early.
    def self.shell_command(program) # :nodoc:
      CMD_CRM + " '" + program.gsub("'") { "'\\''" } + "'"
    end

    # Runs a learn program against _file_ with no input at all.
    # NOTE(review): presumably this leaves an empty statistics file behind;
    # confirm against the CRM114 engine's behaviour.
    #
    # Kept public: the +protected+ keyword further down never applied to
    # class-level methods, so existing callers may rely on it.
    def self.create_css_file(file)
      cmd = shell_command(OPT_LEARN % [CLASSIFICATION_TYPE, file])
      IO.popen(cmd, 'w') { |pipe| pipe.close }
    end

    # Returns a new CRM114 classifier defined by the given _categories_.
    #
    # categories:: one category or a collection of them; each is normalized
    #              to a Symbol
    # options::    <tt>:path</tt> - directory holding the statistics files
    #              (defaults to the current directory);
    #              <tt>:debug</tt> - when true, each command is printed
    #              before it is run
    def initialize(categories, options = {})
      # Array() also accepts a lone Symbol/String, which no longer respond
      # to to_a on current Rubies.
      @categories = Array(categories).collect { |category| category.to_s.to_sym }
      @path = File.expand_path(options[:path] || '.')
      @debug = options[:debug] || false
    end

    # Trains the classifier to consider the given _text_ to be a sample from
    # the set named by _category_. When a block is given it receives the
    # write end of the pipe and is responsible for writing the sample;
    # _text_ is ignored in that case.
    def learn!(category, text = nil, &block)
      cmd = self.class.shell_command(OPT_LEARN % [CLASSIFICATION_TYPE, css_file_path(category)])
      puts cmd if @debug
      IO.popen(cmd, 'w') { |pipe| block_given? ? block.call(pipe) : pipe.write(text) }
    end

    alias_method :train!, :learn!

    # Always raises RuntimeError.
    def unlearn!(category, text, &block) # :nodoc:
      raise 'unlearning not supported at present'
    end

    alias_method :untrain!, :unlearn! #:nodoc:

    # Returns the classification of the provided _text_ as a tuple
    # containing the highest-probability category and a confidence indicator
    # in the range of 0.5..1.0. When a block is given it receives the pipe
    # and writes the input itself. Returns <tt>[nil, 0.0]</tt> when the
    # engine produced no tab-separated result line.
    def classify(text = nil, &block)
      files = @categories.collect { |category| css_file_path(category) }
      # The path is embedded in a CRM114 regex, hence the escaped slashes.
      program = OPT_CLASSIFY % [CLASSIFICATION_TYPE, files.join(' '), @path.gsub(/\//, '\/'), FILE_EXTENSION]
      cmd = self.class.shell_command(program)
      puts cmd if @debug
      result = IO.popen(cmd, 'r+') do |pipe|
        block_given? ? block.call(pipe) : pipe.write(text)
        pipe.close_write
        pipe.readline unless pipe.closed? || pipe.eof?
      end
      return [nil, 0.0] unless result && result.include?("\t")
      result = result.split("\t")
      [result.first.to_sym, result.last.to_f]
    end

    # Dispatches <tt>category!</tt> to learn! and <tt>category?</tt> to a
    # comparison against the best classify result, for configured categories
    # only. Arguments and any block are forwarded unchanged.
    def method_missing(symbol, *args, &block) # :nodoc:
      category = dynamic_category(symbol)
      if category
        # name[-1, 1] yields a one-character String on every Ruby version.
        case symbol.to_s[-1, 1]
        when '!'
          return learn!(category, *args, &block)
        when '?' # it's a predicate
          return classify(*args, &block).first == category
        end
      end
      super
    end

    # Keeps respond_to? in step with the methods method_missing answers.
    def respond_to_missing?(symbol, include_private = false) # :nodoc:
      dynamic_category(symbol) ? true : super
    end

    protected

    # Path of the statistics file backing _category_ inside @path.
    def css_file_path(category)
      File.join(@path, category.to_s + FILE_EXTENSION)
    end

    private

    # Returns the configured category named by a <tt>foo!</tt> / <tt>foo?</tt>
    # style _symbol_, or nil when the symbol is not such a dynamic method.
    def dynamic_category(symbol)
      name = symbol.to_s
      return nil unless name =~ /[!?]\z/
      category = name.chop.to_sym
      category if @categories && @categories.include?(category)
    end

  end

end
@@ -0,0 +1,29 @@
1
+ $:.unshift(File.expand_path(File.dirname(__FILE__) + '/../lib'))
2
+
3
+ require 'test/unit'
4
+ require 'crm114'
5
+
6
# Trains a two-category (code / text) classifier on this project's own
# library sources and documentation files, then checks that a code snippet
# and an English sentence land in the matching category.
class TestCodeOrText < Test::Unit::TestCase

  def setup
    @path = File.dirname(__FILE__)
    # Anchor every fixture to the project root so the suite does not depend
    # on the directory it is launched from.
    root = File.expand_path('..', @path)
    @crm = Classifier::CRM114.new([:code, :text], :path => @path)
    assert_nothing_raised do
      Dir[File.join(root, 'lib', '*.rb')].each { |file| @crm.code! File.read(file) }
      ['CHANGELOG', 'README', 'LICENSE'].each { |file| @crm.text! File.read(File.join(root, file)) }
    end
  end

  # Removes the statistics files the training run left next to the tests.
  def teardown
    Dir["#{@path}/*.css"].each { |file| File.delete(file) }
  end

  def test_code
    assert @crm.code?('class DrStrangelove; def self.lesson; stop_worrying && love_the_bomb; end; end')
  end

  def test_text
    assert @crm.text?('This an interface to the Dr. Strangelove-inspired CRM114 Controllable Regex Mutilator.')
  end

end
@@ -0,0 +1,61 @@
1
+ $:.unshift(File.expand_path(File.dirname(__FILE__) + '/../lib'))
2
+
3
+ require 'test/unit'
4
+ require 'crm114'
5
+
6
# Exercises the classifier end to end with two categories trained on a short
# English passage (:interesting) and a block of lorem ipsum (:boring).
class TestCRM114 < Test::Unit::TestCase

  def setup
    @path = File.dirname(__FILE__)
    @crm = Classifier::CRM114.new([:interesting, :boring], :path => @path)

    interesting_sample = <<EOT
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called
data. The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In
effect, we conjure the spirits of the computer with our spells.
EOT

    boring_sample = <<EOT
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Nam vitae
nisi nec sapien congue porttitor. Proin quam risus, pharetra non,
lacinia sed, vehicula non, eros. Suspendisse velit augue, aliquet
vel, sagittis vitae, porttitor sed, metus. Integer tortor tellus,
tempus tincidunt, viverra a, fringilla vitae, sapien. Ut ac eros.
Donec molestie nulla sed nibh. Pellentesque quam quam, vehicula sed,
venenatis vitae, tristique quis, lectus. Aenean odio purus, pharetra
non, facilisis sed, rutrum eu, lectus. Curabitur odio. Ut laoreet
dolor vitae nunc. Donec dapibus. Morbi tempor libero et dolor.
Aliquam rutrum metus quis nibh. Ut pharetra turpis vel metus. Donec
vel arcu. Sed neque orci, accumsan et, faucibus quis, porttitor in,
lacus. Lorem ipsum dolor sit amet, consectetuer adipiscing elit.
Phasellus et arcu. Mauris nunc.
EOT

    assert_nothing_raised do
      @crm.train!(:interesting, interesting_sample)
      @crm.train!(:boring, boring_sample)
    end
  end

  # Deletes the statistics files written next to the tests during training.
  def teardown
    Dir.glob("#{@path}/*.css").each do |css_file|
      File.delete(css_file)
    end
  end

  def test_version
    engine_version = Classifier::CRM114.version
    assert_match(/^[\d]+-[\w\d]+$/, engine_version)
  end

  def test_unlearning
    assert_raise(RuntimeError) do
      @crm.unlearn!(:boring, 'Lorem ipsum')
    end
  end

  def test_interesting
    best, = @crm.classify('Thus, programs must be written for people to read,')
    assert_equal(:interesting, best)
    assert_equal(true, @crm.interesting?('and only incidentally for machines to execute.'))
    assert_equal(false, @crm.boring?('learning to program is considerably less dangerous than learning sorcery'))
  end

  def test_boring
    best, = @crm.classify('Lorem ipsum dolor sit amet, sed neque orci.')
    assert_equal(:boring, best)
    assert_equal(false, @crm.interesting?('Donec dapibus. Morbi tempor libero et dolor.'))
    assert_equal(true, @crm.boring?('Aliquam rutrum metus quis nibh. Ut pharetra turpis vel metus.'))
  end

end
metadata ADDED
@@ -0,0 +1,63 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.9.0
3
+ specification_version: 1
4
+ name: crm114
5
+ version: !ruby/object:Gem::Version
6
+ version: 1.0.0
7
+ date: 2006-11-06 00:00:00 +01:00
8
+ summary: Ruby interface to the CRM114 Controllable Regex Mutilator text classification engine.
9
+ require_paths:
10
+ - lib
11
+ - test
12
+ email: arto.bendiken@gmail.com
13
+ homepage: http://crm114.rubyforge.org/
14
+ rubyforge_project: crm114
15
+ description: This is a Ruby interface to the CRM114 Controllable Regex Mutilator, an advanced and fast text classifier that uses sparse binary polynomial matching with a Bayesian Chain Rule evaluator and a hidden Markov model to categorize data with up to a 99.87% accuracy.
16
+ autorequire:
17
+ default_executable:
18
+ bindir: bin
19
+ has_rdoc: true
20
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
21
+ requirements:
22
+ - - ">"
23
+ - !ruby/object:Gem::Version
24
+ version: 0.0.0
25
+ version:
26
+ platform: ruby
27
+ signing_key:
28
+ cert_chain:
29
+ post_install_message:
30
+ authors:
31
+ - Arto Bendiken
32
+ files:
33
+ - CHANGELOG
34
+ - LICENSE
35
+ - Manifest.txt
36
+ - README
37
+ - Rakefile
38
+ - lib/crm114.rb
39
+ - test/test_code_or_text.rb
40
+ - test/test_crm114.rb
41
+ test_files: []
42
+
43
+ rdoc_options:
44
+ - --main
45
+ - README
46
+ extra_rdoc_files: []
47
+
48
+ executables: []
49
+
50
+ extensions: []
51
+
52
+ requirements: []
53
+
54
+ dependencies:
55
+ - !ruby/object:Gem::Dependency
56
+ name: hoe
57
+ version_requirement:
58
+ version_requirements: !ruby/object:Gem::Version::Requirement
59
+ requirements:
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ version: 1.1.2
63
+ version: