ngrams_search 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/README.rdoc +69 -0
- data/bin/ngrams_search +40 -0
- data/lib/ngrams_search.rb +45 -0
- metadata +62 -0
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
ZTAzOTJmOGYyODNjZWNkZDgyNzkzYTAzNTU2MThlNzAxNGI2OTMxNw==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
MDAyMzU2NzZkY2YxM2E1NGQ4NGU4YTJiZjdmMTJiZDIyZjBlNTZiZA==
|
7
|
+
!binary "U0hBNTEy":
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
ZDU1NzZlMWZmNzc3N2U0MTMzNTU5YzQ4MzZmODVmZmE1MDBiZmU4OGQ2OGI3
|
10
|
+
Y2U2NWUwNzMzZWNlYzY2NTMwMGJlOTg4ZGUxNGQ0NzlkZTlkMDkzMTdiNDI1
|
11
|
+
ZTllZTA2MzdiYTQ2ZDI0NzlhOGUzOWIzY2Y4OTYzYTE1ZDU5Y2E=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
N2U4NThhZTgxMDE3Yjg2ZWJhNzY3MWUyNmZjNjc4MTAxMjA1ZWI5NGZiYmM0
|
14
|
+
MGU1YzhmZTEyNzVkNTA4ZDJlMTQ3ZGJkNmMxZTRkMWJjNjcyYjA5MTQ4NjEw
|
15
|
+
NTA0MTJhNzg3YThjNWIwMDI1ZDhhNTMzYzAwYTQ4NDEzYmM1ZDM=
|
data/README.rdoc
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
= ngrams_search
|
2
|
+
|
3
|
+
Author:: Elias Hasnat
|
4
|
+
|
5
|
+
= Description
|
6
|
+
|
7
|
+
ngrams_search is an extension of Ruby's core String class. It provides a String
|
8
|
+
object with the capability to produce n-grams.
|
9
|
+
|
10
|
+
From Wikipedia,
|
11
|
+
"In the fields of computational linguistics and probability, an n-gram is a
|
12
|
+
contiguous sequence of n items from a given sequence of text or speech. The
|
13
|
+
items in question can be phonemes, syllables, letters, words or base pairs
|
14
|
+
according to the application. n-grams are collected from a text or speech corpus.
|
15
|
+
|
16
|
+
An n-gram of size 1 is referred to as a "unigram"; size 2 is a "bigram"
|
17
|
+
(or, less commonly, a "digram"); size 3 is a "trigram"; size 4 is a "four-gram"
|
18
|
+
and size 5 or more is simply called an "n-gram"."
|
19
|
+
|
20
|
+
= Design
|
21
|
+
|
22
|
+
Instead of creating another namespace, this task seemed simple enough to merit
|
23
|
+
extending the String class. A string is a sequence of characters.
|
24
|
+
It can be words, binary code, sentences, paragraphs, etc. In short,
|
25
|
+
anything that you can store in a Ruby String object can be parsed into
|
26
|
+
n-grams of length n.
|
27
|
+
|
28
|
+
The main method being added to the String class is ngrams(). It produces an
|
29
|
+
array of n-grams from a Ruby String object.
|
30
|
+
|
31
|
+
For example, let s be a Ruby String object.
|
32
|
+
Then s.ngrams() returns an array of n-grams from s.
|
33
|
+
|
34
|
+
Tokenization of s is set to single characters by default.
|
35
|
+
For example, if
|
36
|
+
s = "Hello World!",
|
37
|
+
then the tokens of s are
|
38
|
+
["H","e","l","l","o"," ","W","o","r","l","d","!"].
|
39
|
+
By specifying a regular expression, you can tokenize the string s in many
|
40
|
+
different and useful ways.
|
41
|
+
|
42
|
+
If you set n = 4, then
|
43
|
+
s.ngrams = [["H", "e", "l", "l"],
|
44
|
+
["e", "l", "l", "o"],
|
45
|
+
["l", "l", "o", " "],
|
46
|
+
["l", "o", " ", "W"],
|
47
|
+
["o", " ", "W", "o"],
|
48
|
+
[" ", "W", "o", "r"],
|
49
|
+
["W", "o", "r", "l"],
|
50
|
+
["o", "r", "l", "d"],
|
51
|
+
["r", "l", "d", "!"]].
|
52
|
+
|
53
|
+
Each item in the s.ngrams array can be joined but doesn't need to be.
|
54
|
+
If you want to join them, normally you can do so easily if it is text.
|
55
|
+
Be careful if you are trying to join n-grams with non-printable characters.
|
56
|
+
|
57
|
+
You can google "n-grams" to get more information about how n-grams are useful.
|
58
|
+
|
59
|
+
= Installation
|
60
|
+
gem install ngrams_search
|
61
|
+
|
62
|
+
= Usage
|
63
|
+
|
64
|
+
You can simply run the executable and provide input via STDIN.
|
65
|
+
ngrams_search
|
66
|
+
|
67
|
+
You can also provide input via one or more filenames
|
68
|
+
ngrams_search [FILES]
|
69
|
+
|
data/bin/ngrams_search
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'ruby_cli'
|
4
|
+
require 'ngrams_search'
|
5
|
+
|
6
|
+
# Command-line front end: reads text from files or STDIN and prints its
# n-grams, one per line.
# NOTE(review): RubyCLI comes from the ruby_cli gem; it presumably supplies
# @opt_parser and #run and calls the hook methods below -- confirm against
# that gem.
class App
  include RubyCLI

  # Seeds @options with the defaults: an empty regex (split into single
  # characters) and n = 2.
  def initialize_command_options
    @options = {:regex => //, :n => 2}
  end

  # Registers the -n/--n and -r/--regex switches on @opt_parser. Each switch
  # overwrites the matching entry in @options with the parsed value.
  def define_command_option_parsing
    @opt_parser.on('-n', '--n NUM', Integer, 'set length n for n-grams') { |length| @options[:n] = length }
    @opt_parser.on('-r', '--regex "REGEX"', Regexp, 'set regex to split string into tokens') { |pattern| @options[:regex] = pattern }
  end

  # Reads all input and prints each n-gram (in its inspect form) on its own
  # line.
  #
  # Input is taken from ARGF, Ruby's built-in stream over the files named in
  # ARGV, or over STDIN when ARGV is empty. ARGF treats every remaining
  # argument as a filename, so an argument that is not a readable file ends
  # in a runtime error reporting that the file could not be opened.
  def command
    ARGF.read.ngrams(@options).each do |gram|
      puts gram.inspect
    end
  end
end
|
36
|
+
|
37
|
+
# Build the CLI application from the command-line arguments and run it.
App.new(ARGV, __FILE__).run
|
39
|
+
|
40
|
+
|
@@ -0,0 +1,45 @@
|
|
1
|
+
# This is an extension of Ruby's core String class.
|
2
|
+
# It adds methods to extract a set of n-grams from a string.
|
3
|
+
# Typically, the most used set of n-grams are unigrams, bigrams, and trigrams;
|
4
|
+
# sets of n-grams of length 1, 2, and 3, respectively.
|
5
|
+
class String

  # An n-gram is a sequence of n contiguous tokens, where a token is typically
  # a single character or a word delimited by space characters. Depending on
  # the application a token could also be a line, a fixed-length character
  # sequence, a string with embedded spaces, etc.
  #
  # Splits the string into tokens and returns every run of n contiguous
  # tokens as an array of token arrays, in order of appearance.
  #
  # Options (a missing key falls back to its default):
  #   :regex - pattern handed to String#split to tokenize the string
  #            (default: //, i.e. single characters)
  #   :n     - number of tokens per n-gram (default: 2)
  #
  # Regex Examples:
  #   //      => splits into characters
  #   /\s+/   => splits into words delimited by one or more space characters
  #   /\n+/   => splits into lines delimited by one or more newline characters
  #
  # Returns an empty array when the string has fewer than n tokens.
  # Raises ArgumentError unless n is a positive Integer.
  def ngrams(options = {:regex=>//, :n=>2})
    # fetch (rather than []) so that a partial hash such as {:n => 3} still
    # gets the documented default for the key it leaves out.
    regex = options.fetch(:regex, //)
    n = options.fetch(:n, 2)
    unless n.is_a?(Integer) && n > 0
      raise ArgumentError, "n must be a positive Integer (got #{n.inspect})"
    end
    # each_cons yields each window of n consecutive tokens as a new array.
    split(regex).each_cons(n).to_a
  end

  # Splits the string into unigrams (n-grams of length 1);
  # tokenizes into characters by default.
  def unigrams(regex = //)
    ngrams({:regex => regex, :n => 1})
  end

  # Splits the string into bigrams (n-grams of length 2);
  # tokenizes into characters by default.
  def bigrams(regex = //)
    ngrams({:regex => regex, :n => 2})
  end

  # Splits the string into trigrams (n-grams of length 3);
  # tokenizes into characters by default.
  def trigrams(regex = //)
    ngrams({:regex => regex, :n => 3})
  end

end #class String
|
44
|
+
|
45
|
+
|
metadata
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: ngrams_search
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Elias Hasnat
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-09-03 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: ruby_cli
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ! '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 0.2.0
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ! '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 0.2.0
|
27
|
+
description: n-grams string search
|
28
|
+
email: android.hasnat@gmail.com
|
29
|
+
executables:
|
30
|
+
- ngrams_search
|
31
|
+
extensions: []
|
32
|
+
extra_rdoc_files: []
|
33
|
+
files:
|
34
|
+
- lib/ngrams_search.rb
|
35
|
+
- bin/ngrams_search
|
36
|
+
- README.rdoc
|
37
|
+
homepage: http://github.com/claymodel/telephony/ngrams_search
|
38
|
+
licenses: []
|
39
|
+
metadata: {}
|
40
|
+
post_install_message:
|
41
|
+
rdoc_options: []
|
42
|
+
require_paths:
|
43
|
+
- lib
|
44
|
+
- bin
|
45
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
46
|
+
requirements:
|
47
|
+
- - ! '>='
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: '0'
|
50
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
requirements: []
|
56
|
+
rubyforge_project:
|
57
|
+
rubygems_version: 2.0.7
|
58
|
+
signing_key:
|
59
|
+
specification_version: 4
|
60
|
+
summary: Search string using n-grams
|
61
|
+
test_files: []
|
62
|
+
has_rdoc:
|