ruby_ngrams 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. data/bin/ruby_ngrams +58 -0
  2. data/lib/ruby_ngrams.rb +47 -0
  3. metadata +5 -5
data/bin/ruby_ngrams CHANGED
@@ -0,0 +1,58 @@
1
+ #!/usr/bin/ruby -w
2
+
3
+ require 'ruby_cli'
4
+ require 'ruby_ngrams'
5
+
6
+ class App
7
+ include RubyCLI
8
+
9
+ def define_command_options() @options = {:regex => //, :n => 2} end
10
+
11
+ # Define an OptionParser to parse the command line
12
+ def parse_options?
13
+ #configure an OptionParser
14
+ @opt_parser = OptionParser.new do |opts|
15
+ opts.banner = "Usage: #{__FILE__} [OPTIONS]... [FILE]..."
16
+ opts.separator ""
17
+ opts.separator "Specific options:"
18
+ opts.on('-h', '--help', 'displays help information') do
19
+ @default_options[:help] = true
20
+ output_help
21
+ end
22
+ opts.on('-V','--verbose','Run verbosely') do
23
+ @default_options[:verbose] = true
24
+ end
25
+ opts.on('-n', '--n NUM', Integer, 'set length n for n-grams') do |n|
26
+ @options[:n] = n
27
+ end
28
+ opts.on('-r', '--regex REGEX', Regexp, 'set regex to split string into tokens') do |r|
29
+ @options[:regex] = r
30
+ end
31
+ end
32
+ @opt_parser.parse!(@default_argv) rescue return false
33
+ true
34
+ end
35
+
36
+ def command
37
+ # If arguments were provided, then they have to be names of files.
38
+ # These files will be handled using Ruby's ARGF builtin variable.
39
+ # If arguments are not filenames, then this application will produce a
40
+ # runtime error informing the user that the given file could not be opened.
41
+
42
+ # ARGF is a stream designed for use in scripts that process files given as
43
+ # command-line arguments or passed in via STDIN.
44
+ # The arguments passed to your script are stored in the ARGV Array,
45
+ # one argument per element. ARGF assumes that any arguments that aren’t
46
+ # filenames have been removed from ARGV.
47
+ text = ARGF.read
48
+ text.ngrams(@options).each { |ngram| puts ngram.inspect }
49
+ end
50
+
51
+ end
52
+
53
+
54
+ if __FILE__ == $0
55
+ app = App.new(ARGV)
56
+ app.run
57
+ end
58
+
data/lib/ruby_ngrams.rb CHANGED
@@ -0,0 +1,47 @@
1
+ # This is an extension of Ruby's core String class.
2
+ # It adds methods to extract a set of n-grams from a string.
3
+ # Typically, the most used sets of n-grams are unigrams, bigrams, and trigrams;
4
+ # sets of n-grams of length 1, 2, and 3, respectively.
5
+ class String
6
+
7
+ # An n-gram is a sequence of units of text of length n, where those units are
8
+ # typically single characters or words delimited by space characters.
9
+ # However, a token could also be a fixed length character sequence, strings
10
+ # with embedded spaces, etc. depending on the intended application.
11
+ # Typically, n-grams are formed of contiguous tokens.
12
+ #
13
+ # This function splits the string into a set of n-grams.
14
+ # The default regex used tokenizes the string into characters.
15
+ #
16
+ # Regex Examples:
17
+ # // => splits into characters
18
+ # /\s+/ => splits into words delimited by one or more space characters
19
+ # /\n+/ => splits into lines delimited by one or more newline characters
20
+ #
21
+ # TODO: Determine efficiency of this function on long strings.
22
+ # TODO: Determine how well this works on strings in binary format.
23
+ def ngrams(options = {:regex=>//, :n=>2})
24
+ ngrams = []
25
+ tokens = self.split(options[:regex])
26
+ max_pos = tokens.length - options[:n]
27
+ for i in 0..max_pos
28
+ ngrams.push(tokens[i..i+(options[:n]-1)])
29
+ end
30
+ ngrams
31
+ end
32
+
33
+ # This function splits the string into unigrams,
34
+ # tokenizes into chars by default
35
+ def unigrams(regex = //) ngrams({:regex => regex, :n => 1}); end
36
+
37
+ # This function splits the string into bigrams
38
+ # tokenizes into chars by default
39
+ def bigrams(regex = //) ngrams({:regex => regex, :n => 2}); end
40
+
41
+ # This function splits the string into trigrams
42
+ # tokenizes into chars by default
43
+ def trigrams(regex = //) ngrams({:regex => regex, :n => 3}); end
44
+
45
+ end #class String
46
+
47
+
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby_ngrams
3
3
  version: !ruby/object:Gem::Version
4
- hash: 29
4
+ hash: 27
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 0
9
- - 1
10
- version: 0.0.1
9
+ - 2
10
+ version: 0.0.2
11
11
  platform: ruby
12
12
  authors:
13
13
  - Martin Velez
@@ -15,11 +15,11 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-10-19 00:00:00 -07:00
18
+ date: 2011-11-11 00:00:00 -08:00
19
19
  default_executable:
20
20
  dependencies: []
21
21
 
22
- description: A simple Ruby library to parse a string into n-grams
22
+ description: A simple extension of the Ruby core string class to parse a string into n-grams
23
23
  email: mvelez999@gmail.com
24
24
  executables:
25
25
  - ruby_ngrams