datasift 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +21 -0
- data/README.md +38 -0
- data/Rakefile +36 -0
- data/VERSION +1 -0
- data/config.yml +2 -0
- data/datasift.gemspec +28 -0
- data/examples/cost.rb +123 -0
- data/examples/football-buffered.rb +50 -0
- data/examples/football.rb +52 -0
- data/examples/twitter-track.rb +60 -0
- data/lib/DataSift/apiclient.rb +79 -0
- data/lib/DataSift/definition.rb +154 -0
- data/lib/DataSift/exceptions.rb +16 -0
- data/lib/DataSift/mockapiclient.rb +55 -0
- data/lib/DataSift/stream_consumer.rb +124 -0
- data/lib/DataSift/stream_consumer_http.rb +160 -0
- data/lib/DataSift/user.rb +104 -0
- data/lib/datasift.rb +19 -0
- data/test/helper.rb +24 -0
- data/test/test_definition.rb +282 -0
- data/test/test_live_api.rb +100 -0
- data/test/test_user.rb +68 -0
- data/test/testdata.yml +4 -0
- metadata +135 -0
@@ -0,0 +1,154 @@
|
|
1
|
+
#
|
2
|
+
# definition.rb - This file contains the Definition class.
|
3
|
+
#
|
4
|
+
# Copyright (C) 2011 MediaSift Ltd
|
5
|
+
#
|
6
|
+
# == Overview
|
7
|
+
#
|
8
|
+
# The Definition class represents a stream definition. It holds the CSDL for
|
9
|
+
# a stream, compiles it through the API when the hash is needed, and provides
|
10
|
+
# access to the cost breakdown, buffered interactions and stream consumers.
|
11
|
+
|
12
|
+
module DataSift

  # Definition class.
  #
  # == Introduction
  #
  # The Definition class represents a stream definition: a CSDL string plus
  # the hash, cost and creation date obtained by compiling that CSDL through
  # the API.
  #
  class Definition
    attr_reader :csdl, :total_cost, :created_at

    # Constructor. A User object is required, and you can optionally supply a
    # default CSDL string and an already-known hash for it.
    # === Parameters
    #
    # * +user+ - The DataSift::User object.
    # * +csdl+ - Optional default CSDL string.
    # * +hash+ - Optional default hash string (false means "not compiled").
    #
    # Raises InvalidDataError if +user+ is not a DataSift::User or +csdl+ is
    # not a String.
    #
    def initialize(user, csdl = '', hash = false)
      raise InvalidDataError, 'Please supply a valid User object when creating a Definition object.' unless user.is_a? DataSift::User
      @user = user
      clearHash()
      # The CSDL setter clears the hash whenever the CSDL changes, so the
      # CSDL must be assigned before the caller-supplied hash; otherwise the
      # supplied hash would always be thrown away.
      self.csdl = csdl
      @hash = hash
    end

    # CSDL setter. Stores a stripped copy of the incoming string and resets
    # the hash (and cost/creation date) if the CSDL has changed.
    #
    # Raises InvalidDataError if +csdl+ is not a String.
    #
    def csdl=(csdl)
      raise InvalidDataError, 'The CSDL must be a string.' unless csdl.is_a? String
      # Non-destructive strip: the caller's string is left untouched, and
      # frozen strings are accepted.
      stripped = csdl.strip
      clearHash() unless stripped == @csdl
      @csdl = stripped
    end

    # Hash getter. If the hash has not yet been obtained the CSDL will be
    # compiled first. A CompileFailedError raised by that compilation is
    # deliberately swallowed, in which case false is returned.
    #
    # NOTE: this overrides Object#hash, so Definition objects should not be
    # used as Hash keys.
    #
    def hash
      unless @hash
        begin
          compile()
        rescue DataSift::CompileFailedError
          # Ignore - the caller receives false.
        end
      end

      @hash
    end

    # Reset the hash to false. The effect of this is to mark the definition as
    # requiring compilation. The total cost and creation date are reset too.
    def clearHash()
      @hash = false
      @total_cost = false
      @created_at = false
    end

    # Call the DataSift API to compile this definition. On success it will
    # store the returned hash, cost and creation date.
    #
    # Raises InvalidDataError if the CSDL is empty, and CompileFailedError if
    # the API call fails or the response lacks 'hash', 'cost' or 'created_at'.
    #
    def compile()
      raise InvalidDataError, 'Cannot compile an empty definition.' unless @csdl.length > 0

      begin
        res = @user.callAPI('compile', { 'csdl' => @csdl })

        if res.has_key?('hash')
          @hash = res['hash']
        else
          raise CompileFailedError, 'Compiled successfully but no hash in the response'
        end

        if res.has_key?('cost')
          @total_cost = Integer(res['cost'])
        else
          raise CompileFailedError, 'Compiled successfully but no cost in the response'
        end

        if res.has_key?('created_at')
          @created_at = Date.parse(res['created_at'])
        else
          raise CompileFailedError, 'Compiled successfully but no created_at in the response'
        end
      rescue APIError => err
        # Any previously stored compilation result is no longer trustworthy.
        clearHash()

        case err.http_code
        when 400
          raise CompileFailedError, err.message
        else
          # Interpolate the message; adding the exception object itself to a
          # String raises TypeError.
          raise CompileFailedError, "Unexpected APIError code: #{err.http_code} [#{err.message}]"
        end
      end
    end

    # Call the DataSift API to get the cost for this definition. Returns the
    # API response unchanged, which is expected to contain...
    #   costs => The breakdown of running the rule
    #   tags => The tags associated with the rule
    #   total => The total cost of the rule
    #
    # Raises InvalidDataError if the CSDL is empty.
    #
    def getCostBreakdown()
      raise InvalidDataError, "Cannot get the cost for an empty definition." unless @csdl.length > 0

      @user.callAPI('cost', { 'hash' => self.hash })
    end

    # Call the DataSift API to get buffered interactions. Returns the value
    # of the 'stream' key in the API response.
    # === Parameters
    #
    # * +count+ - Optional number of interactions to return (max 200).
    # * +from_id+ - Optional start ID.
    #
    # Raises InvalidDataError if the CSDL is empty, and APIError if the
    # response has no 'stream' key.
    #
    def getBuffered(count = false, from_id = false)
      raise InvalidDataError, "Cannot get buffered interactions for an empty definition." unless @csdl.length > 0

      params = { 'hash' => self.hash }
      params['count'] = count if count
      params['interaction_id'] = from_id if from_id

      retval = @user.callAPI('stream', params)

      # APIError's constructor takes the HTTP code, so the message has to be
      # passed as raise's second argument or it would be stored as the code.
      raise APIError.new(nil), 'No data in the response' unless retval.has_key?('stream')

      retval['stream']
    end

    # Returns a StreamConsumer-derived object for this definition, for the
    # given type.
    # === Parameters
    #
    # * +type+ - The consumer type for which to construct a consumer.
    # * +on_interaction+ - Currently unused.
    # * +on_stopped+ - Currently unused.
    #
    def getConsumer(type = nil, on_interaction = nil, on_stopped = nil)
      StreamConsumer.factory(@user, type, self)
    end
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
module DataSift
  # Simple marker exceptions raised by the library.
  class AccessDeniedError < StandardError; end
  class CompileFailedError < StandardError; end
  class InvalidDataError < StandardError; end
  class NotYetImplementedError < StandardError; end
  class RateLimitExceededError < StandardError; end
  class StreamError < StandardError; end

  # Raised when an API call fails. Carries the HTTP status code of the
  # failed request (nil when no code is known).
  class APIError < StandardError
    attr_reader :http_code

    # === Parameters
    #
    # * +http_code+ - The HTTP status code of the failed request.
    # * +message+ - Optional error message.
    #
    # Both <tt>raise APIError.new(400), 'text'</tt> and
    # <tt>raise APIError, 'text'</tt> are supported. In the latter form Ruby
    # passes the text as the only constructor argument, so a lone non-numeric
    # string is treated as the message rather than as the HTTP code.
    def initialize(http_code = nil, message = nil)
      if message.nil? && http_code.is_a?(String) && http_code !~ /\A\d+\z/
        message = http_code
        http_code = nil
      end

      @http_code = http_code
      super(message)
    end
  end
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
#
|
2
|
+
# mockapiclient.rb - This file contains the MockApiClient class.
|
3
|
+
#
|
4
|
+
# Copyright (C) 2011 MediaSift Ltd
|
5
|
+
#
|
6
|
+
# == Overview
|
7
|
+
#
|
8
|
+
# The MockApiClient class implements a fake DataSift API interface.
|
9
|
+
|
10
|
+
module DataSift
  # MockApiClient class.
  #
  # == Introduction
  #
  # The MockApiClient class implements a fake DataSift API interface. It
  # returns a canned response that is set in advance, and makes no network
  # calls.
  #
  class MockApiClient
    # Start with no response configured, so that #call raises until
    # #setResponse has been used.
    def initialize
      @response = false
    end

    # Set the response to be returned by the call method
    # === Parameters
    #
    # * +code+ - The HTTP response code
    # * +data+ - The dictionary that would have come from the response body
    # * +rate_limit+ - The new rate_limit value
    # * +rate_limit_remaining+ - The new rate_limit_remaining value
    def setResponse(code, data, rate_limit, rate_limit_remaining)
      @response = {
        'response_code' => code,
        'data' => data,
        'rate_limit' => rate_limit,
        'rate_limit_remaining' => rate_limit_remaining,
      }
    end

    # Clear the response so that the next call raises an exception unless a
    # new response has been set first.
    #
    def clearResponse()
      @response = false
    end

    # Fake a call to a DataSift API endpoint. All arguments are accepted for
    # signature compatibility but ignored; the response stored by
    # setResponse is returned as-is, and it is not cleared by the call.
    # === Parameters
    #
    # * +username+ - The username for the Auth header
    # * +api_key+ - The API key for the Auth header
    # * +endpoint+ - The endpoint of the API call.
    # * +params+ - The parameters to be passed along with the request.
    # * +user_agent+ - The user agent string for the request.
    #
    # Raises StandardError if no response has been set.
    def call(username, api_key, endpoint, params = {}, user_agent = 'DataSiftPHP/0.0')
      raise StandardError, 'Expected response not set in mock object' unless @response

      @response
    end
  end
end
|
@@ -0,0 +1,124 @@
|
|
1
|
+
#
|
2
|
+
# stream_consumer.rb - This file contains the StreamConsumer class.
|
3
|
+
#
|
4
|
+
# Copyright (C) 2011 MediaSift Ltd
|
5
|
+
#
|
6
|
+
# == Overview
|
7
|
+
#
|
8
|
+
# The StreamConsumer class is the base class for the various stream consumers.
|
9
|
+
|
10
|
+
module DataSift

  # StreamConsumer class.
  #
  # Base class for stream consumers. It holds the user, the definition being
  # consumed and the consumer state; subclasses implement onStart to do the
  # actual consuming.
  #
  class StreamConsumer
    TYPE_HTTP = 'HTTP'

    STATE_STOPPED = 0
    STATE_STARTING = 1
    STATE_RUNNING = 2
    STATE_STOPPING = 3

    # Factory function. Creates a StreamConsumer-derived object for the given
    # type. The class used is DataSift::StreamConsumer_<type>.
    # === Parameters
    #
    # * +user+ - The user this consumer will run as.
    # * +type+ - Use the TYPE_ constants (defaults to TYPE_HTTP when nil).
    # * +definition+ - CSDL string or a Definition object.
    #
    def self.factory(user, type, definition)
      type ||= TYPE_HTTP
      # A local is used rather than a class-level instance variable so that
      # no state is shared between calls.
      klass = DataSift.const_get('StreamConsumer_' + type)
      klass.new(user, definition)
    end

    attr_accessor :auto_reconnect
    attr_reader :state, :stop_reason

    # Constructor. Do not use this directly, use the factory method instead.
    # The definition is compiled immediately to ensure it is valid for use.
    # === Parameters
    #
    # * +user+ - The user this consumer will run as.
    # * +definition+ - CSDL string or a Definition object.
    #
    # Raises InvalidDataError if +user+ is not a DataSift::User or
    # +definition+ is neither a String nor a Definition.
    #
    def initialize(user, definition)
      raise InvalidDataError, 'Please supply a valid User object when creating a Definition object.' unless user.is_a? DataSift::User

      if definition.is_a? String
        @definition = user.createDefinition(definition)
      elsif definition.is_a? Definition
        @definition = definition
      else
        raise InvalidDataError, 'The definition must be a CSDL string or a DataSift_Definition object'
      end

      @user = user
      @auto_reconnect = true
      @stop_reason = 'Unknown reason'
      @state = STATE_STOPPED
      @on_stopped = nil

      # Compile the definition to ensure it's valid for use
      @definition.compile()
    end

    # Registers or returns the handler that is called when the consumer
    # stops. With a block, the block is stored and self is returned so calls
    # can be chained. Without a block, the stored handler (or nil) is
    # returned. The handler is called with the stop reason.
    #
    def onStopped(&block)
      if block_given?
        @on_stopped = block
        self
      else
        @on_stopped
      end
    end

    # Once an instance of a StreamConsumer is ready for use, call this to
    # start consuming. Extending classes should implement onStart to handle
    # actually starting.
    # === Parameters
    #
    # * +auto_reconnect+ - Whether the consumer should automatically reconnect.
    # * +block+ - An optional block to receive incoming interactions.
    #
    def consume(auto_reconnect = true, &block)
      @auto_reconnect = auto_reconnect

      # Start consuming
      @state = STATE_STARTING
      onStart(&block)
    end

    # Called when the consumer should start consuming the stream. This
    # default implementation only prints a warning; subclasses override it.
    #
    def onStart(&block)
      puts 'onStart method has not been overridden!'
    end

    # This method can be called at any time to *request* that the consumer
    # stop consuming. This method sets the state to STATE_STOPPING and it's
    # up to the consumer implementation to notice that this has changed, stop
    # consuming and call the onStop method.
    #
    # Raises InvalidDataError unless the consumer is currently running.
    #
    def stop()
      # This must be a comparison: an assignment here is always truthy, so it
      # would never raise and would overwrite the state.
      raise InvalidDataError, 'Consumer state must be RUNNING before it can be stopped' unless @state == StreamConsumer::STATE_RUNNING
      @state = StreamConsumer::STATE_STOPPING
    end

    # Default implementation of onStop. It's unlikely that this method will
    # ever be used in isolation, but rather it should be called as the final
    # step in the extending class's implementation. It records the reason,
    # marks the consumer as stopped and calls the onStopped handler, if any.
    # === Parameters
    #
    # * +reason+ - The reason why the consumer stopped.
    #
    def onStop(reason = '')
      # Only substitute a reason when none was given and the stop was not
      # requested through stop().
      reason = 'Unexpected' if @state != StreamConsumer::STATE_STOPPING && reason.length == 0
      @state = StreamConsumer::STATE_STOPPED
      @stop_reason = reason

      handler = onStopped
      handler.call(reason) unless handler.nil?
    end
  end
end
|
@@ -0,0 +1,160 @@
|
|
1
|
+
#
|
2
|
+
# stream_consumer_http.rb - This file contains the StreamConsumer_HTTP class.
|
3
|
+
#
|
4
|
+
# Copyright (C) 2011 MediaSift Ltd
|
5
|
+
#
|
6
|
+
# == Overview
|
7
|
+
#
|
8
|
+
# The StreamConsumer_HTTP class implements HTTP streaming.
|
9
|
+
|
10
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__) + '/../')
|
11
|
+
|
12
|
+
require 'uri'
|
13
|
+
require 'socket'
|
14
|
+
require 'yajl'
|
15
|
+
|
16
|
+
module DataSift

  # StreamConsumer_HTTP class.
  #
  # Consumes a stream over a raw TCP socket using an HTTP/1.1 GET request and
  # feeds the response body to a Yajl parser.
  #
  class StreamConsumer_HTTP < StreamConsumer

    # Constructor. Requires valid user and definition objects.
    def initialize(user, definition)
      super
    end

    # Connects if necessary and reads the stream, passing each parsed object
    # to the given block. Keeps going while auto_reconnect is set and the
    # consumer is running, then disconnects and calls onStop with the reason.
    #
    # Raises StreamError if a chunked response arrives but no block was
    # given, or if a non-chunked response has an unhandled MIME type.
    #
    def onStart(&block)
      loop do
        # A socket that has reached EOF is still "open" as far as closed? is
        # concerned, so EOF must trigger a reconnect too; otherwise this loop
        # would spin without ever reading anything.
        reconnect() if @socket.nil? || @socket.closed? || @socket.eof?

        parser = Yajl::Parser.new
        parser.on_parse_complete = block if block_given?

        if @response_head[:headers]["Transfer-Encoding"] == 'chunked'
          raise StreamError, 'Chunked responses detected, but no block given to handle the chunks.' unless block_given?

          readChunkedStream(parser)
        else
          content_type = @response_head[:headers]['Content-Type'].to_s.split(';').first.to_s
          # NOTE(review): ALLOWED_MIME_TYPES is not defined in this file -
          # confirm that it is defined elsewhere in the DataSift namespace.
          if ALLOWED_MIME_TYPES.include?(content_type)
            reader_options = {}
            case @response_head[:headers]['Content-Encoding']
            when 'gzip'
              return Yajl::Gzip::StreamReader.parse(@socket, reader_options, &block)
            when 'deflate'
              return Yajl::Deflate::StreamReader.parse(@socket, reader_options.merge({:deflate_options => -Zlib::MAX_WBITS}), &block)
            when 'bzip2'
              return Yajl::Bzip2::StreamReader.parse(@socket, reader_options, &block)
            else
              return parser.parse(@socket)
            end
          else
            raise StreamError, 'Unhandled response MIME type ' + content_type
          end
        end

        break unless @auto_reconnect && @state == StreamConsumer::STATE_RUNNING
      end

      disconnect()

      if @state == StreamConsumer::STATE_STOPPING
        @stop_reason = 'Stop requested'
      else
        @stop_reason = 'Connection dropped'
      end

      onStop(@stop_reason)
    end

    # (Re)opens the connection to the stream for the definition's hash and
    # reads the response head. On a 200 response the state becomes
    # STATE_RUNNING. Other responses are retried with an increasing delay
    # (10 seconds, doubling on each further failure).
    #
    # Raises StreamError on a 404 response, or when the connection still
    # fails once the delay has reached 240 seconds or more. Network errors
    # raised by the socket are not rescued here and propagate to the caller.
    #
    def reconnect()
      # NOTE(review): CGI is not required by this file - presumably it is
      # loaded elsewhere in the library; confirm.
      uri = URI.parse('http://' + User::STREAM_BASE_URL + @definition.hash +
        '?username=' + CGI.escape(@user.username) + '&api_key=' + CGI.escape(@user.api_key))

      user_agent = @user.getUserAgent()

      request = "GET #{uri.path}#{uri.query ? "?"+uri.query : nil} HTTP/1.1\r\n"
      request << "Host: #{uri.host}\r\n"
      request << "User-Agent: #{user_agent}\r\n"
      request << "Accept: */*\r\n"
      request << "\r\n"

      connection_delay = 0

      begin
        # Close the socket if it's open
        disconnect()

        # Back off a bit if required
        sleep(connection_delay) if connection_delay > 0

        @socket = TCPSocket.new(uri.host, uri.port)
        @socket.write(request)
        readResponseHead()

        code = @response_head[:code]
        if code == 200
          # Success!
          @state = StreamConsumer::STATE_RUNNING
        elsif code == 404
          disconnect()
          raise StreamError, 'Hash not found!'
        else
          # Interpolation is used because the code is an Integer (or nil) and
          # the message may be nil; String#+ would raise TypeError.
          failure = "Connection failed: #{code} #{@response_head[:msg]}"
          puts failure
          if connection_delay == 0
            connection_delay = 10
          elsif connection_delay < 240
            connection_delay *= 2
          else
            disconnect()
            raise StreamError, failure
          end
        end
      end while @state != StreamConsumer::STATE_RUNNING
    end

    # Closes the socket if there is one and it is still open.
    def disconnect()
      @socket.close if !@socket.nil? and !@socket.closed?
    end

    private

    # Reads the status line and headers from the socket into @response_head
    # (:version, :code, :msg and the :headers hash), stopping at the blank
    # line that ends the headers.
    def readResponseHead()
      @response_head = {}
      @response_head[:headers] = {}

      @socket.each_line do |line|
        break if line == "\r\n" # end of the headers

        # Limit the split so header values containing ": " stay intact.
        name, value = line.split(': ', 2)
        if value.nil?
          # No "name: value" separator, so this is the status line.
          version, code, msg = line.strip.split(' ', 3)
          @response_head[:version] = version
          @response_head[:code] = code.to_i
          @response_head[:msg] = msg
        else
          @response_head[:headers][name] = value.strip
        end
      end
    end

    # Reads a chunked response body from the socket, feeding each chunk to
    # the parser. Stops at EOF, at a chunk-size line starting with 0, or
    # when a stop has been requested.
    def readChunkedStream(parser)
      while !@socket.eof? && (line = @socket.gets)
        break if @state == StreamConsumer::STATE_STOPPING
        break if line.match(/^0.*?\r\n/)
        next if line == "\r\n"

        size = line.hex
        json = @socket.read(size)
        next if json.nil?

        # Always feed what was read; a short read must not be discarded.
        parser << json

        missing = size - json.size
        if missing > 0
          # received only part of the chunk, grab the rest
          rest = @socket.read(missing)
          parser << rest unless rest.nil? || rest.empty?
        end
      end
    end

  end

end
|