frequent-algorithm 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +7 -7
- data/CHANGELOG +23 -9
- data/LICENSE +22 -22
- data/README.md +159 -149
- data/lib/frequent-algorithm.rb +28 -28
- data/lib/frequent/algorithm.rb +201 -182
- data/lib/frequent/version.rb +38 -38
- metadata +17 -15
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0ab3af8c9299159321aa18f30c5022ff7e8b3042
|
4
|
+
data.tar.gz: 01250f9d283874ae66fb0660415ac74476724b43
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c38e275789110b1abac68a91f60f1a952aac05ddba011d22e213c38e33132b6261d92fc05b994b3310422c4081e6b6e1018c1854cf4f9562c3aea9e40d680a12
|
7
|
+
data.tar.gz: 8a84db9ba16c809a7700973cd62738f8cbd4b19461bff56f6c6f9192000e2e0cc4affc136576682cdcf29fb18a37b0653197404773426ab40629e0a1c493ced3
|
data/.yardopts
CHANGED
@@ -1,7 +1,7 @@
|
|
1
|
-
--no-private
|
2
|
-
--readme README.md
|
3
|
-
--markup markdown
|
4
|
-
--markup-provider rdiscount
|
5
|
-
-
|
6
|
-
LICENSE
|
7
|
-
CHANGELOG
|
1
|
+
--no-private
|
2
|
+
--readme README.md
|
3
|
+
--markup markdown
|
4
|
+
--markup-provider rdiscount
|
5
|
+
-
|
6
|
+
LICENSE
|
7
|
+
CHANGELOG
|
data/CHANGELOG
CHANGED
@@ -1,9 +1,23 @@
|
|
1
|
-
## CHANGELOG
|
2
|
-
|
3
|
-
- __2015/
|
4
|
-
-
|
5
|
-
-
|
6
|
-
-
|
7
|
-
|
8
|
-
-
|
9
|
-
|
1
|
+
## CHANGELOG
|
2
|
+
|
3
|
+
- __2015/05/06__ 0.0.4 release.
|
4
|
+
- Issue 17: Enhance Algorithm: new method to return top-k statistics
|
5
|
+
- Issue 42: Enhance algorithm: Accept one element as parameter in the process method
|
6
|
+
- Issue 45: FIX markdown in CHANGELOG
|
7
|
+
- Issue 53: Get this project to run/test against Rubinius
|
8
|
+
- Initial work to make the code thread-safe
|
9
|
+
|
10
|
+
- __2015/03/23__ 0.0.3 release.
|
11
|
+
- Further refinements to process.
|
12
|
+
- Enhanced strategy for calculating kth largest element in list.
|
13
|
+
- Resolved Issue 24: Refactor - Consistent internal data structure.
|
14
|
+
- Wontfix Issue 28: Add new test cases using String as input.
|
15
|
+
- On-going refinements for unit tests.
|
16
|
+
|
17
|
+
- __2015/03/19__ 0.0.2 release.
|
18
|
+
- First-stage implementation.
|
19
|
+
- API documentation added.
|
20
|
+
- Fleshing out unit tests.
|
21
|
+
|
22
|
+
- __2015/03/11__: 0.0.1 release.
|
23
|
+
- Initial release.
|
data/LICENSE
CHANGED
@@ -1,22 +1,22 @@
|
|
1
|
-
The MIT License (MIT)
|
2
|
-
|
3
|
-
Copyright (c) 2015 Willie Tong, Brooke M. Fujita
|
4
|
-
|
5
|
-
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
-
of this software and associated documentation files (the "Software"), to deal
|
7
|
-
in the Software without restriction, including without limitation the rights
|
8
|
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
-
copies of the Software, and to permit persons to whom the Software is
|
10
|
-
furnished to do so, subject to the following conditions:
|
11
|
-
|
12
|
-
The above copyright notice and this permission notice shall be included in all
|
13
|
-
copies or substantial portions of the Software.
|
14
|
-
|
15
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
-
SOFTWARE.
|
22
|
-
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2015 Willie Tong, Brooke M. Fujita
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
22
|
+
|
data/README.md
CHANGED
@@ -1,149 +1,159 @@
|
|
1
|
-
# frequent-algorithm
|
2
|
-
|
3
|
-
Web site usage, social network behavior and Internet traffic are examples
|
4
|
-
of systems that appear to follow the [Power law](http://en.wikipedia.org/wiki/Power_law),
|
5
|
-
where most of the events are due to the actions of a very small few.
|
6
|
-
Knowing at any given point in time which items are trending is valuable
|
7
|
-
in understanding the system.
|
8
|
-
|
9
|
-
`frequent-algorithm` is a Ruby implementation of the FREQUENT algorithm
|
10
|
-
for identifying frequent items in a data stream in sliding windows.
|
11
|
-
Please refer to [Identifying Frequent Items in Sliding Windows over On-Line
|
12
|
-
Packet Streams](http://erikdemaine.org/papers/SlidingWindow_IMC2003/), by
|
13
|
-
Golab, DeHaan, Demaine, López-Ortiz and Munro (2003).
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
Challenges
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
*
|
25
|
-
*
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
*
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
>
|
39
|
-
>
|
40
|
-
>
|
41
|
-
>
|
42
|
-
>  
|
43
|
-
>
|
44
|
-
>
|
45
|
-
>
|
46
|
-
>
|
47
|
-
>
|
48
|
-
>
|
49
|
-
>
|
50
|
-
>  
|
51
|
-
>
|
52
|
-
>
|
53
|
-
>
|
54
|
-
>
|
55
|
-
>  
|
56
|
-
>  
|
57
|
-
>
|
58
|
-
>  
|
59
|
-
>
|
60
|
-
>
|
61
|
-
> &
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
data
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
* [
|
96
|
-
* [
|
97
|
-
* [`
|
98
|
-
* [`
|
99
|
-
* [`
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
rake
|
108
|
-
rake
|
109
|
-
rake
|
110
|
-
rake
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
1
|
+
# frequent-algorithm
|
2
|
+
|
3
|
+
Web site usage, social network behavior and Internet traffic are examples
|
4
|
+
of systems that appear to follow the [Power law](http://en.wikipedia.org/wiki/Power_law),
|
5
|
+
where most of the events are due to the actions of a very small few.
|
6
|
+
Knowing at any given point in time which items are trending is valuable
|
7
|
+
in understanding the system.
|
8
|
+
|
9
|
+
`frequent-algorithm` is a Ruby implementation of the FREQUENT algorithm
|
10
|
+
for identifying frequent items in a data stream in sliding windows.
|
11
|
+
Please refer to [Identifying Frequent Items in Sliding Windows over On-Line
|
12
|
+
Packet Streams](http://erikdemaine.org/papers/SlidingWindow_IMC2003/), by
|
13
|
+
Golab, DeHaan, Demaine, López-Ortiz and Munro (2003).
|
14
|
+
|
15
|
+
[]() [](https://travis-ci.org/buruzaemon/frequent-algorithm) [](https://rubygems.org/gems/frequent-algorithm)
|
16
|
+
|
17
|
+
## Introduction
|
18
|
+
|
19
|
+
### Challenges
|
20
|
+
|
21
|
+
Challenges for Real-time processing of data streams for _frequent item queries_
|
22
|
+
include:
|
23
|
+
|
24
|
+
* data may be of unknown and possibly unbound length
|
25
|
+
* data may be arriving at a very fast rate
|
26
|
+
* it might not be possible to go back and re-read the data
|
27
|
+
* too large a window of observation may include stale data
|
28
|
+
|
29
|
+
Therefore, a solution should have the following characteristics:
|
30
|
+
|
31
|
+
* uses limited memory
|
32
|
+
* can process events in the stream in O(1) constant time
|
33
|
+
* requires only a single-pass over the data
|
34
|
+
|
35
|
+
|
36
|
+
### The algorithm
|
37
|
+
|
38
|
+
> LOOP<br/>
|
39
|
+
> 1. For each element e in the next b elements:<br/>
|
40
|
+
> If a local counter exists for the type of element e:<br/>
|
41
|
+
> Increment the local counter.<br/>
|
42
|
+
> Otherwise:<br/>
|
43
|
+
> Create a new local counter for this element type<br/>
|
44
|
+
> and set it equal to 1.<br/>
|
45
|
+
> 2. Add a summary S containing identities and counts of the k most frequent items to the back of queue Q.<br/>
|
46
|
+
> 3. Delete all local counters<br/>
|
47
|
+
> 4. For each type named in S:<br/>
|
48
|
+
> If a global counter exists for this type:<br/>
|
49
|
+
> Add to it the count recorded in S.<br/>
|
50
|
+
> Otherwise:<br/>
|
51
|
+
> Create a new global counter for this element type<br/>
|
52
|
+
> and set it equal to the count recorded in S.<br/>
|
53
|
+
> 5. Add the count of the kth largest type in S to δ.<br/>
|
54
|
+
> 6. If sizeOf(Q) > N/b:<br/>
|
55
|
+
> (a) Remove the summary S' from the front of Q and subtract the count of the kth largest type in S' from δ.<br/>
|
56
|
+
> (b) For all element types named in S':<br/>
|
57
|
+
> Subtract from their global counters the counts<br/>
|
58
|
+
> recorded in S'<br/>
|
59
|
+
> If a counter is decremented to zero:<br/>
|
60
|
+
> Delete it.<br/>
|
61
|
+
> (c) Output the identity and value of each global counter > δ.
|
62
|
+
>
|
63
|
+
> — <cite>Golab, DeHaan, Demaine, López-Ortiz and Munro. Identifying Frequent Items in Sliding Windows over On-Line Packet Streams, 2003</cite>
|
64
|
+
|
65
|
+
|
66
|
+
## Usage
|
67
|
+
|
68
|
+
require 'frequent-algorithm'
|
69
|
+
|
70
|
+
# data is pi to 1000 digits
|
71
|
+
pi = File.read('test/frequent/test_data_pi').strip
|
72
|
+
data = pi.scan(/./).each_slice(b)
|
73
|
+
|
74
|
+
N = 100 # size of main window
|
75
|
+
b = 20 # size of basic window
|
76
|
+
k = 3 # we are interested in top-3 numerals in pi
|
77
|
+
|
78
|
+
alg = Frequent::Algorithm.new(N, b, k)
|
79
|
+
|
80
|
+
# read in and process the 1st basic window
|
81
|
+
alg.process(data.next)
|
82
|
+
|
83
|
+
# and the top-3 numerals are?
|
84
|
+
top3 = alg.statistics.report
|
85
|
+
puts top3
|
86
|
+
|
87
|
+
# lather, rinse and repeat
|
88
|
+
alg.process(data.next)
|
89
|
+
|
90
|
+
|
91
|
+
## Development
|
92
|
+
|
93
|
+
The development of this gem requires the following:
|
94
|
+
|
95
|
+
* [Ruby 1.9.3 or greater](https://www.ruby-lang.org/en/)
|
96
|
+
* [rubygems](https://rubygems.org/pages/download)
|
97
|
+
* [`bundler`](https://github.com/bundler/bundler)
|
98
|
+
* [`rake`](https://github.com/ruby/rake)
|
99
|
+
* [`minitest`](https://rubygems.org/gems/minitest) (unit testing)
|
100
|
+
* [`yard`](https://rubygems.org/gems/yard) (documentation)
|
101
|
+
* [`rdiscount`](https://rubygems.org/gems/rdiscount) (Markdown)
|
102
|
+
|
103
|
+
Building, testing and release of this rubygem uses the following
|
104
|
+
`rake` commands:
|
105
|
+
|
106
|
+
|
107
|
+
rake clean # Remove any temporary products
|
108
|
+
rake clobber # Remove any generated file
|
109
|
+
rake test # Execute unit tests
|
110
|
+
rake build # Build frequent-algorithm-n.n.n.gem into the pkg directory
|
111
|
+
rake install # Build and install frequent-algorithm-n.n.n.gem into system gems
|
112
|
+
rake release # Create tag vn.n.n and build and push
|
113
|
+
# frequent-algorithm-n.n.n.gem to Rubygems
|
114
|
+
|
115
|
+
|
116
|
+
### Documentation
|
117
|
+
|
118
|
+
`frequent-algorithm` uses [`yard`](https://rubygems.org/gems/yard) and
|
119
|
+
[`rdiscount`](https://rubygems.org/gems/rdiscount) for Markdown documentation.
|
120
|
+
Check out [Getting Started with
|
121
|
+
Yard](http://www.rubydoc.info/gems/yard/file/docs/GettingStarted.md).
|
122
|
+
|
123
|
+
|
124
|
+
### Unit Testing
|
125
|
+
|
126
|
+
`frequent-algorithm` uses
|
127
|
+
[`MiniTest::Unit`](https://github.com/seattlerb/minitest) for
|
128
|
+
unit testing.
|
129
|
+
|
130
|
+
|
131
|
+
### Releasing
|
132
|
+
|
133
|
+
Please refer to Publishing To Rubygems.org in the
|
134
|
+
[Rubygems Guide](http://guides.rubygems.org/make-your-own-gem/).
|
135
|
+
|
136
|
+
|
137
|
+
### Contributing
|
138
|
+
|
139
|
+
1. Fork it
|
140
|
+
2. Begin work on `dev-branch` (`git fetch && git checkout dev-branch`)
|
141
|
+
3. Create your feature branch (`git branch my-new-feature && git checkout
|
142
|
+
my-new-feature`)
|
143
|
+
4. Commit your changes (`git commit -am 'Add some feature'`)
|
144
|
+
5. Push to the branch (`git push origin my-new-feature:dev-branch`)
|
145
|
+
6. Create new Pull Request
|
146
|
+
|
147
|
+
You may wish to read the [Git book online](http://git-scm.com/book/en/v2).
|
148
|
+
|
149
|
+
|
150
|
+
## Changelog
|
151
|
+
|
152
|
+
Please see the {file:CHANGELOG} for this gem's release history.
|
153
|
+
|
154
|
+
|
155
|
+
## License
|
156
|
+
|
157
|
+
frequent-algorithm is provided under the terms of the MIT license.
|
158
|
+
|
159
|
+
Copyright © 2015, Willie Tong & Brooke M. Fujita. All rights reserved. Please see the {file:LICENSE} file for further details.
|
data/lib/frequent-algorithm.rb
CHANGED
@@ -1,28 +1,28 @@
|
|
1
|
-
# coding: utf-8
|
2
|
-
require 'frequent/algorithm'
|
3
|
-
|
4
|
-
=begin
|
5
|
-
|
6
|
-
The MIT License (MIT)
|
7
|
-
|
8
|
-
Copyright (c) 2015 Willie Tong, Brooke M. Fujita
|
9
|
-
|
10
|
-
Permission is hereby granted, free of charge, to any person obtaining a
|
11
|
-
copy of this software and associated documentation files (the "Software"),
|
12
|
-
to deal in the Software without restriction, including without limitation
|
13
|
-
the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
14
|
-
and/or sell copies of the Software, and to permit persons to whom the
|
15
|
-
Software is furnished to do so, subject to the following conditions:
|
16
|
-
|
17
|
-
The above copyright notice and this permission notice shall be included
|
18
|
-
in all copies or substantial portions of the Software.
|
19
|
-
|
20
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
21
|
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
22
|
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
23
|
-
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
24
|
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
25
|
-
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
26
|
-
IN THE SOFTWARE.
|
27
|
-
|
28
|
-
=end
|
1
|
+
# coding: utf-8
|
2
|
+
require 'frequent/algorithm'
|
3
|
+
|
4
|
+
=begin
|
5
|
+
|
6
|
+
The MIT License (MIT)
|
7
|
+
|
8
|
+
Copyright (c) 2015 Willie Tong, Brooke M. Fujita
|
9
|
+
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a
|
11
|
+
copy of this software and associated documentation files (the "Software"),
|
12
|
+
to deal in the Software without restriction, including without limitation
|
13
|
+
the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
14
|
+
and/or sell copies of the Software, and to permit persons to whom the
|
15
|
+
Software is furnished to do so, subject to the following conditions:
|
16
|
+
|
17
|
+
The above copyright notice and this permission notice shall be included
|
18
|
+
in all copies or substantial portions of the Software.
|
19
|
+
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
23
|
+
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
25
|
+
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
26
|
+
IN THE SOFTWARE.
|
27
|
+
|
28
|
+
=end
|
data/lib/frequent/algorithm.rb
CHANGED
@@ -1,182 +1,201 @@
|
|
1
|
-
# coding: utf-8
|
2
|
-
require 'frequent/version'
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
#
|
11
|
-
#
|
12
|
-
#
|
13
|
-
#
|
14
|
-
#
|
15
|
-
# *
|
16
|
-
# * require
|
17
|
-
#
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
# @return [Integer] the number of items in
|
22
|
-
attr_reader :
|
23
|
-
# @return [Integer] the number of
|
24
|
-
attr_reader :
|
25
|
-
# @return [
|
26
|
-
attr_reader :
|
27
|
-
# @return [Hash<Object,Integer
|
28
|
-
attr_reader :
|
29
|
-
# @return [Integer]
|
30
|
-
attr_reader :
|
31
|
-
|
32
|
-
|
33
|
-
#
|
34
|
-
|
35
|
-
# @
|
36
|
-
|
37
|
-
|
38
|
-
#
|
39
|
-
#
|
40
|
-
# @
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
#
|
130
|
-
#
|
131
|
-
# @return [Integer] the
|
132
|
-
def
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
1
|
+
# coding: utf-8
|
2
|
+
require 'frequent/version'
|
3
|
+
require 'thread'
|
4
|
+
|
5
|
+
module Frequent
|
6
|
+
|
7
|
+
ERR_BADLIST = "List cannot be nil or empty".freeze
|
8
|
+
ERR_BADK = "k must be between 1 and %s".freeze
|
9
|
+
|
10
|
+
# `Frequent::Algorithm` is the Ruby implementation of the
|
11
|
+
# Demaine et al. FREQUENT algorithm for calculating
|
12
|
+
# top-k items in a stream.
|
13
|
+
#
|
14
|
+
# The aims of this algorithm are:
|
15
|
+
# * use limited memory
|
16
|
+
# * require constant processing time per item
|
17
|
+
# * require a single-pass only
|
18
|
+
#
|
19
|
+
class Algorithm
|
20
|
+
|
21
|
+
# @return [Integer] the number of items in the main window
|
22
|
+
attr_reader :n
|
23
|
+
# @return [Integer] the number of items in a basic window
|
24
|
+
attr_reader :b
|
25
|
+
# @return [Integer] the number of top item categories to track
|
26
|
+
attr_reader :k
|
27
|
+
# @return [Array<Hash<Object,Integer>>] global queue for basic window summaries
|
28
|
+
attr_reader :queue
|
29
|
+
# @return [Hash<Object,Integer>] global mapping of items and counts
|
30
|
+
attr_reader :statistics
|
31
|
+
# @return [Integer] minimum threshold for membership in top-k items
|
32
|
+
attr_reader :delta
|
33
|
+
# @return [Hash<Object,Integer>] latest top k elements and their counts
|
34
|
+
attr_reader :topk
|
35
|
+
# @return [Array<Object>] the window of elements of size b
|
36
|
+
attr_reader :window
|
37
|
+
|
38
|
+
# Initializes this top-k frequency-calculating instance.
|
39
|
+
#
|
40
|
+
# @param [Integer] n number of items in the main window
|
41
|
+
# @param [Integer] b number of items in a basic window
|
42
|
+
# @param [Integer] k number of top item categories to track
|
43
|
+
# @raise [ArgumentError] if n is not greater than 0
|
44
|
+
# @raise [ArgumentError] if b is not greater than 0
|
45
|
+
# @raise [ArgumentError] if k is not greater than 0
|
46
|
+
# @raise [ArgumentError] if n/b is less than 1
|
47
|
+
def initialize(n, b, k=1)
|
48
|
+
@lock = Mutex.new
|
49
|
+
|
50
|
+
if n <= 0
|
51
|
+
raise ArgumentError.new('n must be greater than 0')
|
52
|
+
end
|
53
|
+
if b <= 0
|
54
|
+
raise ArgumentError.new('b must be greater than 0')
|
55
|
+
end
|
56
|
+
if k <= 0
|
57
|
+
raise ArgumentError.new('k must be greater than 0')
|
58
|
+
end
|
59
|
+
if n/b < 1
|
60
|
+
raise ArgumentError.new('n/b must be greater than 1')
|
61
|
+
end
|
62
|
+
@n = n
|
63
|
+
@b = b
|
64
|
+
@k = k
|
65
|
+
|
66
|
+
@queue = []
|
67
|
+
@statistics = {}
|
68
|
+
@delta = 0
|
69
|
+
@topk = {}
|
70
|
+
@window = []
|
71
|
+
end
|
72
|
+
|
73
|
+
# Processes a single basic window of b items, by first adding
|
74
|
+
# a summary of this basic window in the internal global queue;
|
75
|
+
# and then updating the global statistics accordingly.
|
76
|
+
#
|
77
|
+
# @param [Object] element an object from a data stream
|
78
|
+
def process(element)
|
79
|
+
@lock.synchronize do
|
80
|
+
@window << element
|
81
|
+
if @window.length == @b
|
82
|
+
|
83
|
+
# Step 1
|
84
|
+
summary = {}
|
85
|
+
@window.each do |e|
|
86
|
+
if summary.key? e
|
87
|
+
summary[e] += 1
|
88
|
+
else
|
89
|
+
summary[e] = 1
|
90
|
+
end
|
91
|
+
end
|
92
|
+
@window.clear #current window cleared
|
93
|
+
|
94
|
+
# Step 2
|
95
|
+
@queue << summary
|
96
|
+
|
97
|
+
# Step 3
|
98
|
+
# Done, implicitly
|
99
|
+
|
100
|
+
# Step 4
|
101
|
+
summary.each do |k,v|
|
102
|
+
if @statistics.key? k
|
103
|
+
@statistics[k] += v
|
104
|
+
else
|
105
|
+
@statistics[k] = v
|
106
|
+
end
|
107
|
+
end
|
108
|
+
|
109
|
+
# Step 5
|
110
|
+
@delta += kth_largest(summary.values, @k)
|
111
|
+
|
112
|
+
# Step 6 - sizeOf(Q) > N/b
|
113
|
+
if @queue.length > @n/@b
|
114
|
+
# a
|
115
|
+
summary_p = @queue.shift
|
116
|
+
@delta -= kth_largest(summary_p.values, @k)
|
117
|
+
|
118
|
+
# b
|
119
|
+
summary_p.each { |k,v| @statistics[k] -= v }
|
120
|
+
@statistics.delete_if { |k,v| v <= 0 }
|
121
|
+
|
122
|
+
#c
|
123
|
+
@topk = @statistics.select { |k,v| v > @delta }
|
124
|
+
end
|
125
|
+
end
|
126
|
+
end
|
127
|
+
end
|
128
|
+
|
129
|
+
# Return the latest Top K elements
|
130
|
+
#
|
131
|
+
# @return [Hash<Object,Integer>] a hash which contains the current top K elements and their counts
|
132
|
+
def report
|
133
|
+
@topk
|
134
|
+
end
|
135
|
+
|
136
|
+
# Returns the version for this gem.
|
137
|
+
#
|
138
|
+
# @return [String] the version for this gem.
|
139
|
+
def version
|
140
|
+
Frequent::VERSION
|
141
|
+
end
|
142
|
+
|
143
|
+
private
|
144
|
+
# Given a list of numbers and a number k which should be
|
145
|
+
# between 1 and the length of the given list, return the
|
146
|
+
# element x in the list that is larger than exactly k-1
|
147
|
+
# other elements in the list.
|
148
|
+
#
|
149
|
+
# @param [Array] list of integers
|
150
|
+
# @return [Integer] the kth largest element in list
|
151
|
+
def kth_largest(list, k)
|
152
|
+
raise ArgumentError.new(ERR_BADLIST) if list.nil? or list.empty?
|
153
|
+
raise ArgumentError.new(ERR_BADK) if k < 1
|
154
|
+
|
155
|
+
ulist = list.uniq
|
156
|
+
k = ulist.size if k > ulist.size
|
157
|
+
|
158
|
+
def quickselect(aset, k)
|
159
|
+
p = rand(aset.size)
|
160
|
+
|
161
|
+
lower = aset.select { |e| e < aset[p] }
|
162
|
+
upper = aset.select { |e| e > aset[p] }
|
163
|
+
|
164
|
+
if k <= lower.size
|
165
|
+
quickselect(lower, k)
|
166
|
+
elsif k > aset.size - upper.size
|
167
|
+
quickselect(upper, k - (aset.size - upper.size))
|
168
|
+
else
|
169
|
+
aset[p]
|
170
|
+
end
|
171
|
+
end
|
172
|
+
quickselect(ulist, ulist.size+1-k)
|
173
|
+
end
|
174
|
+
end
|
175
|
+
end
|
176
|
+
|
177
|
+
=begin
|
178
|
+
|
179
|
+
The MIT License (MIT)
|
180
|
+
|
181
|
+
Copyright (c) 2015 Willie Tong, Brooke M. Fujita
|
182
|
+
|
183
|
+
Permission is hereby granted, free of charge, to any person obtaining a
|
184
|
+
copy of this software and associated documentation files (the "Software"),
|
185
|
+
to deal in the Software without restriction, including without limitation
|
186
|
+
the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
187
|
+
and/or sell copies of the Software, and to permit persons to whom the
|
188
|
+
Software is furnished to do so, subject to the following conditions:
|
189
|
+
|
190
|
+
The above copyright notice and this permission notice shall be included
|
191
|
+
in all copies or substantial portions of the Software.
|
192
|
+
|
193
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
194
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
195
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
196
|
+
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
197
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
198
|
+
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
199
|
+
IN THE SOFTWARE.
|
200
|
+
|
201
|
+
=end
|
data/lib/frequent/version.rb
CHANGED
@@ -1,38 +1,38 @@
|
|
1
|
-
# coding: utf-8
|
2
|
-
|
3
|
-
# `Frequent` is the namespace for objects implementing
|
4
|
-
# the Demaine et al. FREQUENT algorithm for finding
|
5
|
-
# the most frequently-appearing items (top-k) in a
|
6
|
-
# data stream in sliding windows.
|
7
|
-
#
|
8
|
-
# `Frequent::Algorithm` is the implementation class.
|
9
|
-
module Frequent
|
10
|
-
# Version string for this Rubygem.
|
11
|
-
VERSION = '0.0.
|
12
|
-
end
|
13
|
-
|
14
|
-
=begin
|
15
|
-
|
16
|
-
The MIT License (MIT)
|
17
|
-
|
18
|
-
Copyright (c) 2015 Willie Tong, Brooke M. Fujita
|
19
|
-
|
20
|
-
Permission is hereby granted, free of charge, to any person obtaining a
|
21
|
-
copy of this software and associated documentation files (the "Software"),
|
22
|
-
to deal in the Software without restriction, including without limitation
|
23
|
-
the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
24
|
-
and/or sell copies of the Software, and to permit persons to whom the
|
25
|
-
Software is furnished to do so, subject to the following conditions:
|
26
|
-
|
27
|
-
The above copyright notice and this permission notice shall be included
|
28
|
-
in all copies or substantial portions of the Software.
|
29
|
-
|
30
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
31
|
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
32
|
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
33
|
-
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
34
|
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
35
|
-
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
36
|
-
IN THE SOFTWARE.
|
37
|
-
|
38
|
-
=end
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
# `Frequent` is the namespace for objects implementing
|
4
|
+
# the Demaine et al. FREQUENT algorithm for finding
|
5
|
+
# the most frequently-appearing items (top-k) in a
|
6
|
+
# data stream in sliding windows.
|
7
|
+
#
|
8
|
+
# `Frequent::Algorithm` is the implementation class.
|
9
|
+
module Frequent
|
10
|
+
# Version string for this Rubygem.
|
11
|
+
VERSION = '0.0.4'
|
12
|
+
end
|
13
|
+
|
14
|
+
=begin
|
15
|
+
|
16
|
+
The MIT License (MIT)
|
17
|
+
|
18
|
+
Copyright (c) 2015 Willie Tong, Brooke M. Fujita
|
19
|
+
|
20
|
+
Permission is hereby granted, free of charge, to any person obtaining a
|
21
|
+
copy of this software and associated documentation files (the "Software"),
|
22
|
+
to deal in the Software without restriction, including without limitation
|
23
|
+
the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
24
|
+
and/or sell copies of the Software, and to permit persons to whom the
|
25
|
+
Software is furnished to do so, subject to the following conditions:
|
26
|
+
|
27
|
+
The above copyright notice and this permission notice shall be included
|
28
|
+
in all copies or substantial portions of the Software.
|
29
|
+
|
30
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
31
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
32
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
33
|
+
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
34
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
35
|
+
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
36
|
+
IN THE SOFTWARE.
|
37
|
+
|
38
|
+
=end
|
metadata
CHANGED
@@ -1,44 +1,44 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: frequent-algorithm
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Willie Tong
|
8
8
|
- Brooke M. Fujita
|
9
|
-
autorequire:
|
9
|
+
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2015-
|
12
|
+
date: 2015-05-06 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rake
|
16
|
-
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
17
|
requirements:
|
18
18
|
- - ">="
|
19
19
|
- !ruby/object:Gem::Version
|
20
20
|
version: '0'
|
21
|
-
|
21
|
+
type: :development
|
22
|
+
prerelease: false
|
23
|
+
version_requirements: !ruby/object:Gem::Requirement
|
22
24
|
requirements:
|
23
25
|
- - ">="
|
24
26
|
- !ruby/object:Gem::Version
|
25
27
|
version: '0'
|
26
|
-
prerelease: false
|
27
|
-
type: :development
|
28
28
|
- !ruby/object:Gem::Dependency
|
29
29
|
name: minitest
|
30
|
-
|
30
|
+
requirement: !ruby/object:Gem::Requirement
|
31
31
|
requirements:
|
32
32
|
- - ">="
|
33
33
|
- !ruby/object:Gem::Version
|
34
34
|
version: '0'
|
35
|
-
|
35
|
+
type: :development
|
36
|
+
prerelease: false
|
37
|
+
version_requirements: !ruby/object:Gem::Requirement
|
36
38
|
requirements:
|
37
39
|
- - ">="
|
38
40
|
- !ruby/object:Gem::Version
|
39
41
|
version: '0'
|
40
|
-
prerelease: false
|
41
|
-
type: :development
|
42
42
|
description: |
|
43
43
|
frequent-algorithm is a Ruby implementation of the Demaine et al FREQUENT algorithm for identifying frequent items in a data stream in sliding windows (c.f Identifying Frequent Items in Sliding Windows over On-Line Packet Streams, 2003).
|
44
44
|
email:
|
@@ -59,7 +59,7 @@ homepage: https://github.com/buruzaemon/frequent-algorithm
|
|
59
59
|
licenses:
|
60
60
|
- MIT
|
61
61
|
metadata: {}
|
62
|
-
post_install_message:
|
62
|
+
post_install_message:
|
63
63
|
rdoc_options: []
|
64
64
|
require_paths:
|
65
65
|
- lib
|
@@ -74,9 +74,11 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
74
74
|
- !ruby/object:Gem::Version
|
75
75
|
version: '0'
|
76
76
|
requirements: []
|
77
|
-
rubyforge_project:
|
77
|
+
rubyforge_project:
|
78
78
|
rubygems_version: 2.4.5
|
79
|
-
signing_key:
|
79
|
+
signing_key:
|
80
80
|
specification_version: 4
|
81
|
-
summary: Identifies frequent items in a data stream in sliding windows using the Demaine
|
81
|
+
summary: Identifies frequent items in a data stream in sliding windows using the Demaine
|
82
|
+
et al FREQUENT algorithm.
|
82
83
|
test_files: []
|
84
|
+
has_rdoc:
|