crm114 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3 @@
1
+ == 1.0.0 / 2006-11-06
2
+
3
+ * Initial release of CRM114 library for Ruby.
data/LICENSE ADDED
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2005-2006 Arto Bendiken <http://bendiken.net/>
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to
5
+ deal in the Software without restriction, including without limitation the
6
+ rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7
+ sell copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in
11
+ all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
18
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
19
+ IN THE SOFTWARE.
@@ -0,0 +1,8 @@
1
+ CHANGELOG
2
+ LICENSE
3
+ Manifest.txt
4
+ README
5
+ Rakefile
6
+ lib/crm114.rb
7
+ test/test_code_or_text.rb
8
+ test/test_crm114.rb
data/README ADDED
@@ -0,0 +1,59 @@
1
+ == CRM114 Controllable Regex Mutilator for Ruby
2
+
3
+ This is a Ruby interface to the CRM114 Controllable Regex Mutilator, an
4
+ advanced and fast text classifier that uses sparse binary polynomial
5
+ matching with a Bayesian Chain Rule evaluator and a hidden Markov model to
6
+ categorize data with up to 99.87% accuracy.
7
+
8
+ The Ruby wrapper grew out of this:
9
+ * http://bendiken.net/2006/07/05/spam-filters-alien-technology-and-ruby-on-rails
10
+
11
+ === About CRM114
12
+
13
+ * http://crm114.sourceforge.net
14
+ * http://en.wikipedia.org/wiki/CRM114
15
+ * http://en.wikipedia.org/wiki/Dr_Strangelove
16
+ * http://www.paulgraham.com/wsy.html
17
+
18
+ == Download
19
+
20
+ * http://rubyforge.org/projects/crm114
21
+ * gem install crm114
22
+ * svn checkout svn://rubyforge.org/var/svn/crm114
23
+
24
+ == Dependencies
25
+
26
+ Requires the CRM114 binaries to be installed. Specifically, the '+crm+'
27
+ binary should be accessible in the current user's PATH environment variable.
28
+
29
+ == Usage
30
+
31
+ The CRM114 library interface is very similar to that of the
32
+ Classifier[http://rubyforge.org/projects/classifier/] project.
33
+
34
+ Here follows a brief example:
35
+
36
+ require 'crm114'
37
+ crm = Classifier::CRM114.new([:interesting, :boring])
38
+ crm.train! :interesting, 'Some data set with a decent signal to noise ratio.'
39
+ crm.train! :boring, 'Pig latin, as in lorem ipsum dolor sit amet.'
40
+ crm.classify 'Lorem ipsum'      # => [:boring, 0.99]
41
+ crm.interesting? 'Lorem ipsum'  # => false
42
+ crm.boring? 'Lorem ipsum'       # => true
43
+
44
+ Have a look at the included unit tests for more comprehensive examples.
45
+
46
+ == Related Projects
47
+
48
+ * http://www.elegantchaos.com/node/129 (crm.py)
49
+ * http://rubyforge.org/projects/classifier/
50
+ * http://rubyforge.org/projects/bishop/
51
+
52
+ == Author
53
+
54
+ Arto Bendiken (mailto:arto.bendiken@gmail.com) - http://bendiken.net
55
+
56
+ == License
57
+
58
+ Released under the terms of the MIT license. See the accompanying LICENSE
59
+ file for more information.
@@ -0,0 +1,41 @@
1
# Put the bundled lib/ directory at the front of the load path so that
# `require 'crm114'` below picks up this checkout of the library.
$LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__) + '/lib'))

require 'rubygems'
require 'crm114'

# Package metadata. The version is read from the library itself so that it
# only has to be maintained in one place.
PKG_NAME    = 'crm114'
PKG_VERSION = Classifier::CRM114::VERSION
PKG_DESC    = 'Ruby interface to the CRM114 Controllable Regex Mutilator text classification engine.'
PKG_URL     = 'http://crm114.rubyforge.org/'

PKG_AUTHOR  = 'Arto Bendiken'
PKG_EMAIL   = 'arto.bendiken@gmail.com'

##############################################################################

require 'hoe'

# Hand the package metadata over to Hoe. The long description and the change
# notes are taken from paragraphs of the README and CHANGELOG files.
Hoe.new(PKG_NAME, PKG_VERSION) do |hoe|
  hoe.author      = PKG_AUTHOR
  hoe.email       = PKG_EMAIL
  hoe.url         = PKG_URL
  hoe.summary     = PKG_DESC
  hoe.description = hoe.paragraphs_of('README', 1).first
  hoe.changes     = hoe.paragraphs_of('CHANGELOG', 0..1).join("\n\n")
  hoe.spec_extras = { :rdoc_options => ['--main', 'README'] }
end
27
+
28
+ ##############################################################################
29
+
30
# Prints every line matching _pattern_ in the files selected by the glob
# _files_, formatted as "file:lineno:line" with 1-based line numbers.
#
# pattern:: a Regexp (or anything usable on the right-hand side of =~).
# files::   a glob pattern understood by Dir[], e.g. '**/*.rb'.
#
# Returns the array of file names produced by the glob.
def egrep(pattern, files)
  Dir[files].each do |file|
    # File.foreach closes the file when iteration ends, so no handle is left
    # open for each scanned file.
    File.foreach(file).with_index(1) do |line, lineno|
      puts "#{file}:#{lineno}:#{line}" if line =~ pattern
    end
  end
end
37
+
38
desc 'Look for TODO and FIXME tags in the code base.'
task :todo do
  # The call is parenthesised on purpose: a bare regexp literal as the first
  # argument of a paren-less call makes Ruby emit an "ambiguous first
  # argument" warning.
  egrep(/#.*(FIXME|TODO)/, '**/*.rb')
end
@@ -0,0 +1,88 @@
1
+ # Author:: Arto Bendiken (mailto:arto.bendiken@gmail.com)
2
+ # Copyright:: Copyright (c) 2006 Arto Bendiken.
3
+ # License:: MIT
4
+
5
module Classifier

  # Text classifier that drives the external +crm+ command-line binary.
  #
  # Each category is backed by one statistics file named
  # "<category>.css" inside the configured directory. Besides the explicit
  # API, every category +foo+ is also reachable through the dynamic methods
  # <tt>foo!(text)</tt> (train) and <tt>foo?(text)</tt> (predicate).
  class CRM114

    VERSION = '1.0.0'

    CLASSIFICATION_TYPE = '<osb unique microgroom>'
    FILE_EXTENSION = '.css'
    CMD_CRM = '/usr/bin/env crm'
    OPT_LEARN = '-{ learn %s ( %s ) }'
    OPT_CLASSIFY = '-{ isolate (:stats:); classify %s ( %s ) (:stats:); match [:stats:] (:: :best: :prob:) /Best match to file .. \\(%s\\/([[:graph:]]+)\\%s\\) prob: ([0-9.]+)/; output /:*:best:\\t:*:prob:/ }'

    # Returns a string containing the installed CRM114 engine version in a
    # format such as "20060118-BlameTheReavers", or nil when the first line
    # printed by <tt>crm -v</tt> does not look like a version banner.
    def self.version
      banner = IO.popen(CMD_CRM + ' -v', 'r') { |pipe| pipe.readline }
      $1 if banner =~ /CRM114, version ([\d\w\-\.]+)/
    end

    # Wraps _text_ in single quotes for the shell, escaping any embedded
    # single quote as '\'' so that the value always reaches the command as
    # exactly one argument. Text without single quotes is quoted exactly as
    # a plain '...' wrapper would.
    def self.shell_quote(text)
      "'" + text.to_s.gsub("'") { "'\\''" } + "'"
    end

    # Builds the complete shell command line that runs the inline CRM114
    # program _script_.
    def self.crm_command(script)
      CMD_CRM + ' ' + shell_quote(script)
    end

    # Returns a new CRM114 classifier defined by the given _categories_.
    #
    # categories:: a single category or a collection of categories; each
    #              entry is normalised to a Symbol.
    # options::    <tt>:path</tt> is the directory holding the statistics
    #              files (default: the current directory); <tt>:debug</tt>
    #              makes the instance print every command before running it.
    def initialize(categories, options = {})
      # Array() also accepts a lone Symbol or String, which no longer
      # responds to #to_a on Ruby 1.9 and later.
      @categories = Array(categories).collect { |category| category.to_s.to_sym }
      @path = File.expand_path(options[:path] || '.')
      @debug = options[:debug] || false
    end

    # Trains the classifier to consider the given _text_ to be a sample from
    # the set named by _category_. When a block is given, _text_ is ignored
    # and the block receives the pipe connected to the engine's standard
    # input so the sample can be streamed.
    def learn!(category, text = nil, &block)
      cmd = self.class.crm_command(OPT_LEARN % [CLASSIFICATION_TYPE, css_file_path(category)])
      puts cmd if @debug
      IO.popen(cmd, 'w') { |pipe| block_given? ? block.call(pipe) : pipe.write(text) }
    end

    alias_method :train!, :learn!

    # Not implemented; always raises a RuntimeError.
    def unlearn!(category, text, &block) # :nodoc:
      raise 'unlearning not supported at present'
    end

    alias_method :untrain!, :unlearn! #:nodoc:

    # Returns the classification of the provided _text_ as a tuple
    # containing the highest-probability category and a confidence indicator
    # in the range of 0.5..1.0. Returns <tt>[nil, 0.0]</tt> when the engine
    # produces no parsable result line. When a block is given, it receives
    # the pipe and is responsible for writing the input.
    def classify(text = nil, &block)
      files = @categories.collect { |category| css_file_path(category) }
      # The directory is interpolated into a CRM114 match pattern, hence the
      # escaped slashes.
      script = OPT_CLASSIFY % [CLASSIFICATION_TYPE, files.join(' '), @path.gsub(/\//, '\/'), FILE_EXTENSION]
      cmd = self.class.crm_command(script)
      puts cmd if @debug
      result = IO.popen(cmd, 'r+') do |pipe|
        block_given? ? block.call(pipe) : pipe.write(text)
        pipe.close_write
        pipe.readline unless pipe.closed? || pipe.eof?
      end
      return [nil, 0.0] unless result && result.include?("\t")
      result = result.split("\t")
      [result.first.to_sym, result.last.to_f]
    end

    # Implements the dynamic per-category methods: <tt>category!</tt>
    # delegates to #learn! and <tt>category?</tt> compares the result of
    # #classify with the category. Anything else falls through to the
    # default behaviour (NoMethodError).
    def method_missing(symbol, *args, &block) # :nodoc:
      name = symbol.to_s
      category = name.chop.to_sym
      if @categories.include?(category)
        return learn!(category, *args, &block) if name.end_with?('!')
        return classify(*args, &block).first == category if name.end_with?('?') # it's a predicate
      end
      super
    end

    # Keeps #respond_to? consistent with the dynamic methods handled by
    # #method_missing.
    def respond_to_missing?(symbol, include_private = false) # :nodoc:
      name = symbol.to_s
      (name.end_with?('!', '?') && @categories.include?(name.chop.to_sym)) || super
    end

    protected

    # Runs a learn command against _file_ with empty input.
    #
    # NOTE: +protected+ has no effect on methods defined with +def self.+,
    # so this class method is publicly callable; it is left that way so
    # existing callers keep working.
    def self.create_css_file(file)
      cmd = crm_command(OPT_LEARN % [CLASSIFICATION_TYPE, file])
      IO.popen(cmd, 'w') { |pipe| pipe.close }
    end

    # Returns the path of the statistics file that backs _category_.
    def css_file_path(category)
      File.join(@path, category.to_s + FILE_EXTENSION)
    end

  end

end
@@ -0,0 +1,29 @@
1
+ $:.unshift(File.expand_path(File.dirname(__FILE__) + '/../lib'))
2
+
3
+ require 'test/unit'
4
+ require 'crm114'
5
+
6
# Trains a two-category classifier (:code vs. :text) on the project's own
# files and checks that unseen samples of each kind are recognised.
class TestCodeOrText < Test::Unit::TestCase

  # Feeds the library sources in as :code and the top-level documentation
  # files in as :text. The statistics files are created next to this test.
  def setup
    @path = File.dirname(__FILE__)
    @crm = Classifier::CRM114.new([:code, :text], :path => @path)
    assert_nothing_raised do
      Dir["#{@path}/../lib/*.rb"].each { |file| @crm.code! File.read(file) }
      # Resolve the documentation files relative to this test file, like the
      # library sources above, so the suite does not depend on the current
      # working directory being the project root.
      ['CHANGELOG', 'README', 'LICENSE'].each do |file|
        @crm.text! File.read(File.join(@path, '..', file))
      end
    end
  end

  # Removes the statistics files written during setup.
  def teardown
    Dir["#{@path}/*.css"].each { |file| File.delete(file) }
  end

  def test_code
    assert @crm.code?('class DrStrangelove; def self.lesson; stop_worrying && love_the_bomb; end; end')
  end

  def test_text
    assert @crm.text?('This an interface to the Dr. Strangelove-inspired CRM114 Controllable Regex Mutilator.')
  end

end
@@ -0,0 +1,61 @@
1
+ $:.unshift(File.expand_path(File.dirname(__FILE__) + '/../lib'))
2
+
3
+ require 'test/unit'
4
+ require 'crm114'
5
+
6
# Exercises the public classifier API with two categories, :interesting and
# :boring, each trained on a single short sample.
class TestCRM114 < Test::Unit::TestCase

  # Builds a classifier whose statistics files live next to this test file
  # and trains both categories, asserting that training does not raise.
  def setup
    @path = File.dirname(__FILE__)
    @crm = Classifier::CRM114.new([:interesting, :boring], :path => @path)

    interesting_sample = <<EOT
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called
data. The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In
effect, we conjure the spirits of the computer with our spells.
EOT

    boring_sample = <<EOT
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Nam vitae
nisi nec sapien congue porttitor. Proin quam risus, pharetra non,
lacinia sed, vehicula non, eros. Suspendisse velit augue, aliquet
vel, sagittis vitae, porttitor sed, metus. Integer tortor tellus,
tempus tincidunt, viverra a, fringilla vitae, sapien. Ut ac eros.
Donec molestie nulla sed nibh. Pellentesque quam quam, vehicula sed,
venenatis vitae, tristique quis, lectus. Aenean odio purus, pharetra
non, facilisis sed, rutrum eu, lectus. Curabitur odio. Ut laoreet
dolor vitae nunc. Donec dapibus. Morbi tempor libero et dolor.
Aliquam rutrum metus quis nibh. Ut pharetra turpis vel metus. Donec
vel arcu. Sed neque orci, accumsan et, faucibus quis, porttitor in,
lacus. Lorem ipsum dolor sit amet, consectetuer adipiscing elit.
Phasellus et arcu. Mauris nunc.
EOT

    assert_nothing_raised do
      @crm.train!(:interesting, interesting_sample)
      @crm.train!(:boring, boring_sample)
    end
  end

  # Deletes every statistics file created in the test directory.
  def teardown
    Dir["#{@path}/*.css"].each do |css_file|
      File.delete(css_file)
    end
  end

  # The engine version is expected to look like "<digits>-<word>".
  def test_version
    engine_version = Classifier::CRM114.version
    assert_match(/^[\d]+-[\w\d]+$/, engine_version)
  end

  # Unlearning is unsupported and must raise.
  def test_unlearning
    assert_raise(RuntimeError) do
      @crm.unlearn!(:boring, 'Lorem ipsum')
    end
  end

  def test_interesting
    winner = @crm.classify('Thus, programs must be written for people to read,').first
    assert_equal(:interesting, winner)
    assert_equal(true, @crm.interesting?('and only incidentally for machines to execute.'))
    assert_equal(false, @crm.boring?('learning to program is considerably less dangerous than learning sorcery'))
  end

  def test_boring
    winner = @crm.classify('Lorem ipsum dolor sit amet, sed neque orci.').first
    assert_equal(:boring, winner)
    assert_equal(false, @crm.interesting?('Donec dapibus. Morbi tempor libero et dolor.'))
    assert_equal(true, @crm.boring?('Aliquam rutrum metus quis nibh. Ut pharetra turpis vel metus.'))
  end

end
metadata ADDED
@@ -0,0 +1,63 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.9.0
3
+ specification_version: 1
4
+ name: crm114
5
+ version: !ruby/object:Gem::Version
6
+ version: 1.0.0
7
+ date: 2006-11-06 00:00:00 +01:00
8
+ summary: Ruby interface to the CRM114 Controllable Regex Mutilator text classification engine.
9
+ require_paths:
10
+ - lib
11
+ - test
12
+ email: arto.bendiken@gmail.com
13
+ homepage: http://crm114.rubyforge.org/
14
+ rubyforge_project: crm114
15
+ description: This is a Ruby interface to the CRM114 Controllable Regex Mutilator, an advanced and fast text classifier that uses sparse binary polynomial matching with a Bayesian Chain Rule evaluator and a hidden Markov model to categorize data with up to a 99.87% accuracy.
16
+ autorequire:
17
+ default_executable:
18
+ bindir: bin
19
+ has_rdoc: true
20
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
21
+ requirements:
22
+ - - ">"
23
+ - !ruby/object:Gem::Version
24
+ version: 0.0.0
25
+ version:
26
+ platform: ruby
27
+ signing_key:
28
+ cert_chain:
29
+ post_install_message:
30
+ authors:
31
+ - Arto Bendiken
32
+ files:
33
+ - CHANGELOG
34
+ - LICENSE
35
+ - Manifest.txt
36
+ - README
37
+ - Rakefile
38
+ - lib/crm114.rb
39
+ - test/test_code_or_text.rb
40
+ - test/test_crm114.rb
41
+ test_files: []
42
+
43
+ rdoc_options:
44
+ - --main
45
+ - README
46
+ extra_rdoc_files: []
47
+
48
+ executables: []
49
+
50
+ extensions: []
51
+
52
+ requirements: []
53
+
54
+ dependencies:
55
+ - !ruby/object:Gem::Dependency
56
+ name: hoe
57
+ version_requirement:
58
+ version_requirements: !ruby/object:Gem::Version::Requirement
59
+ requirements:
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ version: 1.1.2
63
+ version: