datasift 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +21 -0
- data/README.md +38 -0
- data/Rakefile +36 -0
- data/VERSION +1 -0
- data/config.yml +2 -0
- data/datasift.gemspec +28 -0
- data/examples/cost.rb +123 -0
- data/examples/football-buffered.rb +50 -0
- data/examples/football.rb +52 -0
- data/examples/twitter-track.rb +60 -0
- data/lib/DataSift/apiclient.rb +79 -0
- data/lib/DataSift/definition.rb +154 -0
- data/lib/DataSift/exceptions.rb +16 -0
- data/lib/DataSift/mockapiclient.rb +55 -0
- data/lib/DataSift/stream_consumer.rb +124 -0
- data/lib/DataSift/stream_consumer_http.rb +160 -0
- data/lib/DataSift/user.rb +104 -0
- data/lib/datasift.rb +19 -0
- data/test/helper.rb +24 -0
- data/test/test_definition.rb +282 -0
- data/test/test_live_api.rb +100 -0
- data/test/test_user.rb +68 -0
- data/test/testdata.yml +4 -0
- metadata +135 -0
@@ -0,0 +1,154 @@
|
|
1
|
+
#
|
2
|
+
# definition.rb - This file contains the Definition class.
|
3
|
+
#
|
4
|
+
# Copyright (C) 2011 MediaSift Ltd
|
5
|
+
#
|
6
|
+
# == Overview
|
7
|
+
#
|
8
|
+
# The Definition class represents a stream definition. It holds the CSDL for
|
9
|
+
# a stream, compiles it through the API to obtain the stream hash, and offers
|
10
|
+
# access to the cost breakdown, buffered interactions and stream consumers.
|
11
|
+
|
12
|
+
module DataSift

  # Definition class.
  #
  # == Introduction
  #
  # The Definition class represents a stream definition: a CSDL string plus
  # the hash, total cost and creation date returned by the API the last time
  # that CSDL was compiled. The value +false+ is used throughout as the "not
  # yet compiled" marker for those three fields.
  #
  class Definition
    attr_reader :csdl, :total_cost, :created_at

    # Constructor. A User object is required, and you can optionally supply a
    # default CSDL string and a previously obtained hash.
    # === Parameters
    #
    # * +user+ - The DataSift::User object.
    # * +csdl+ - Optional default CSDL string.
    # * +hash+ - Optional default hash string.
    #
    # Raises InvalidDataError if +user+ is not a DataSift::User or +csdl+ is
    # not a String.
    #
    def initialize(user, csdl = '', hash = false)
      raise InvalidDataError, 'Please supply a valid User object when creating a Definition object.' unless user.is_a? DataSift::User
      @user = user
      @csdl = nil
      clearHash()
      # Assign the CSDL before the hash: the setter resets the hash whenever
      # the CSDL changes, which would otherwise throw away the hash passed in.
      self.csdl = csdl
      @hash = hash
    end

    # CSDL setter. Stores a stripped copy of the incoming string and resets
    # the hash (and cost / creation date) if the CSDL has changed.
    #
    # Raises InvalidDataError if +csdl+ is not a String.
    #
    def csdl=(csdl)
      raise InvalidDataError, 'The CSDL must be a string.' unless csdl.is_a? String
      # Use the non-destructive strip so the caller's string is left untouched
      # and frozen strings are accepted.
      stripped = csdl.strip
      clearHash() unless stripped == @csdl
      @csdl = stripped
    end

    # Hash getter. If the hash has not yet been obtained the CSDL will be
    # compiled first. Returns the hash string, or false if compilation failed.
    #
    # NOTE: this intentionally shadows Object#hash, so Definition instances
    # should not be relied upon as Hash keys.
    #
    def hash
      if @hash == false
        begin
          compile()
        rescue DataSift::CompileFailedError
          # Deliberately ignored: a failed compilation is reported to the
          # caller by returning whatever @hash holds (normally false).
        end
      end

      @hash
    end

    # Reset the hash to false. The effect of this is to mark the definition as
    # requiring compilation. The cached cost and creation date are reset too.
    def clearHash()
      @hash = false
      @total_cost = false
      @created_at = false
    end

    # Call the DataSift API to compile this definition. On success it will
    # store the returned hash, cost and creation date.
    #
    # Raises InvalidDataError if the CSDL is empty, and CompileFailedError if
    # the API call fails or the response lacks one of the expected fields.
    #
    def compile()
      raise InvalidDataError, 'Cannot compile an empty definition.' unless @csdl.length > 0

      begin
        res = @user.callAPI('compile', { 'csdl' => @csdl })

        # Each field is stored as soon as it has been validated, so a response
        # that is only partially complete still leaves the earlier fields set.
        raise CompileFailedError, 'Compiled successfully but no hash in the response' unless res.has_key?('hash')
        @hash = res['hash']

        raise CompileFailedError, 'Compiled successfully but no cost in the response' unless res.has_key?('cost')
        @total_cost = Integer(res['cost'])

        raise CompileFailedError, 'Compiled successfully but no created_at in the response' unless res.has_key?('created_at')
        @created_at = Date.parse(res['created_at'])
      rescue APIError => err
        clearHash()

        case err.http_code
        when 400
          raise CompileFailedError, err.message
        else
          # Interpolate the message: concatenating the exception object itself
          # onto a String raises TypeError and hides the real failure.
          raise CompileFailedError, "Unexpected APIError code: #{err.http_code} [#{err.message}]"
        end
      end
    end

    # Call the DataSift API to get the cost for this definition. Returns the
    # API response unchanged; the original documentation describes it as
    # containing...
    #   costs => The breakdown of running the rule
    #   tags => The tags associated with the rule
    #   total => The total cost of the rule
    #
    # Raises InvalidDataError if the CSDL is empty.
    #
    def getCostBreakdown()
      raise InvalidDataError, "Cannot get the cost for an empty definition." unless @csdl.length > 0

      @user.callAPI('cost', { 'hash' => self.hash })
    end

    # Call the DataSift API to get buffered interactions.
    # === Parameters
    #
    # * +count+ - Optional number of interactions to return (max 200).
    # * +from_id+ - Optional start ID.
    #
    # Returns the value of the 'stream' key of the API response. Raises
    # InvalidDataError if the CSDL is empty and APIError if the response has
    # no 'stream' key.
    #
    def getBuffered(count = false, from_id = false)
      raise InvalidDataError, "Cannot get buffered interactions for an empty definition." unless @csdl.length > 0

      params = { 'hash' => self.hash }
      params['count'] = count if count
      params['interaction_id'] = from_id if from_id

      retval = @user.callAPI('stream', params)

      # APIError's constructor takes the HTTP code, not a message, so build
      # the instance first and let raise attach the message to it.
      raise APIError.new(nil), 'No data in the response' unless retval.has_key?('stream')

      retval['stream']
    end

    # Returns a StreamConsumer-derived object for this definition, for the
    # given type.
    # === Parameters
    #
    # * +type+ - The consumer type for which to construct a consumer.
    # * +on_interaction+ - Accepted for compatibility; currently not used.
    # * +on_stopped+ - Accepted for compatibility; currently not used.
    #
    def getConsumer(type = nil, on_interaction = nil, on_stopped = nil)
      StreamConsumer.factory(@user, type, self)
    end
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# Exception classes raised by the DataSift client library. All of them derive
# from StandardError so a bare +rescue+ catches them.
module DataSift
  class AccessDeniedError < StandardError; end
  class CompileFailedError < StandardError; end
  class InvalidDataError < StandardError; end
  class NotYetImplementedError < StandardError; end
  class RateLimitExceededError < StandardError; end
  class StreamError < StandardError; end

  # Error carrying the HTTP status code of a failed API call.
  #
  # Because the first constructor argument is the HTTP code, raise it as
  # <tt>raise APIError.new(code), 'message'</tt> (or pass the message as the
  # second constructor argument); <tt>raise APIError, 'message'</tt> would
  # store the message as the code.
  class APIError < StandardError
    attr_reader :http_code

    # === Parameters
    #
    # * +http_code+ - The HTTP status code associated with the failure.
    # * +message+ - Optional error message; defaults to the class name.
    #
    def initialize(http_code, message = nil)
      # Run the standard Exception initialisation so the message is set up
      # consistently with every other error class in this module.
      super(message)
      @http_code = http_code
    end
  end
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
#
|
2
|
+
# mockapiclient.rb - This file contains the MockApiClient class.
|
3
|
+
#
|
4
|
+
# Copyright (C) 2011 MediaSift Ltd
|
5
|
+
#
|
6
|
+
# == Overview
|
7
|
+
#
|
8
|
+
# The MockApiClient class implements a fake DataSift API interface.
|
9
|
+
|
10
|
+
module DataSift
  # MockApiClient class.
  #
  # == Introduction
  #
  # The MockApiClient class is a stand-in for the real API client: instead of
  # talking to DataSift it hands back whatever response was last primed with
  # setResponse.
  #
  class MockApiClient
    # Prime the response that the next call(s) will return.
    # === Parameters
    #
    # * +code+ - The HTTP response code
    # * +data+ - The dictionary that would have come from the response body
    # * +rate_limit+ - The new rate_limit value
    # * +rate_limit_remaining+ - The new rate_limit_remaining value
    def setResponse(code, data, rate_limit, rate_limit_remaining)
      fields = ['response_code', 'data', 'rate_limit', 'rate_limit_remaining']
      values = [code, data, rate_limit, rate_limit_remaining]
      @response = Hash[fields.zip(values)]
    end

    # Forget the primed response, so that a subsequent call raises unless a
    # new response has been set in the meantime.
    #
    def clearResponse()
      @response = false
    end

    # Pretend to call a DataSift API endpoint. Every argument is ignored; the
    # primed response is returned as-is.
    # === Parameters
    #
    # * +username+ - The username for the Auth header
    # * +api_key+ - The API key for the Auth header
    # * +endpoint+ - The endpoint of the API call.
    # * +params+ - The parameters to be passed along with the request.
    # * +user_agent+ - The user agent string for the request.
    #
    # Raises StandardError if no response has been primed.
    def call(username, api_key, endpoint, params = {}, user_agent = 'DataSiftPHP/0.0')
      raise StandardError, 'Expected response not set in mock object' unless @response
      @response
    end
  end
end
|
@@ -0,0 +1,124 @@
|
|
1
|
+
#
|
2
|
+
# stream_consumer.rb - This file contains the StreamConsumer class.
|
3
|
+
#
|
4
|
+
# Copyright (C) 2011 MediaSift Ltd
|
5
|
+
#
|
6
|
+
# == Overview
|
7
|
+
#
|
8
|
+
# The StreamConsumer class is the base class for the various stream consumers.
|
9
|
+
|
10
|
+
module DataSift

  # StreamConsumer class.
  #
  # Base class for stream consumers. It tracks the consumer state (one of the
  # STATE_ constants), the stop reason and the optional on-stopped callback;
  # subclasses implement onStart to do the actual consuming.
  #
  class StreamConsumer
    TYPE_HTTP = 'HTTP'

    STATE_STOPPED = 0
    STATE_STARTING = 1
    STATE_RUNNING = 2
    STATE_STOPPING = 3

    # Factory function. Creates a StreamConsumer-derived object for the given
    # type by looking up the class DataSift::StreamConsumer_<type>.
    # === Parameters
    #
    # * +user+ - The user the consumer will run as.
    # * +type+ - Use the TYPE_ constants; defaults to TYPE_HTTP when nil.
    # * +definition+ - CSDL string or a Definition object.
    #
    def self.factory(user, type, definition)
      type ||= TYPE_HTTP
      DataSift.const_get('StreamConsumer_' + type).new(user, definition)
    end

    attr_accessor :auto_reconnect
    attr_reader :state, :stop_reason

    # Constructor. Do not use this directly, use the factory method instead.
    # === Parameters
    #
    # * +user+ - The user this consumer will run as.
    # * +definition+ - CSDL string or a Definition object.
    #
    # Raises InvalidDataError when either argument has the wrong type; errors
    # raised while compiling the definition propagate to the caller.
    #
    def initialize(user, definition)
      raise InvalidDataError, 'Please supply a valid User object when creating a Definition object.' unless user.is_a? DataSift::User

      if definition.is_a? String
        @definition = user.createDefinition(definition)
      elsif definition.is_a? Definition
        @definition = definition
      else
        raise InvalidDataError, 'The definition must be a CSDL string or a DataSift_Definition object'
      end

      @user = user
      @auto_reconnect = true
      @stop_reason = 'Unknown reason'
      @state = STATE_STOPPED

      # Compile the definition to ensure it's valid for use
      @definition.compile()
    end

    # Registers or fetches the callback run when the consumer has stopped.
    # With a block, stores it and returns self (so calls can be chained);
    # without one, returns the stored callback (nil if none was registered).
    # The callback is invoked with the reason why the consumer stopped.
    #
    def onStopped(&block)
      if block_given?
        @on_stopped = block
        self
      else
        @on_stopped
      end
    end

    # Once an instance of a StreamConsumer is ready for use, call this to
    # start consuming. Extending classes should implement onStart to handle
    # actually starting.
    # === Parameters
    #
    # * +auto_reconnect+ - Whether the consumer should automatically reconnect.
    # * +block+ - An optional block to receive incoming interactions.
    #
    def consume(auto_reconnect = true, &block)
      @auto_reconnect = auto_reconnect

      # Start consuming
      @state = STATE_STARTING
      onStart(&block)
    end

    # Called when the consumer should start consuming the stream. This base
    # implementation only prints a notice; subclasses override it.
    #
    def onStart()
      puts 'onStart method has not been overridden!'
    end

    # This method can be called at any time to *request* that the consumer
    # stop consuming. This method sets the state to STATE_STOPPING and it's
    # up to the consumer implementation to notice that this has changed, stop
    # consuming and call the onStop method.
    #
    # Raises InvalidDataError unless the consumer is currently running.
    #
    def stop()
      # Compare, don't assign: "=" here would force the state to RUNNING and
      # the guard could never fire.
      raise InvalidDataError, 'Consumer state must be RUNNING before it can be stopped' unless @state == StreamConsumer::STATE_RUNNING
      @state = StreamConsumer::STATE_STOPPING
    end

    # Default implementation of onStop. It's unlikely that this method will
    # ever be used in isolation, but rather it should be called as the final
    # step in the extending class's implementation. It records the reason,
    # marks the consumer as stopped and runs the on-stopped callback, if any.
    # === Parameters
    #
    # * +reason+ - The reason why the consumer stopped.
    #
    def onStop(reason = '')
      # Only fall back to 'Unexpected' when no reason was given and the stop
      # was not requested through #stop; a supplied reason is always kept.
      reason = 'Unexpected' if @state != StreamConsumer::STATE_STOPPING && reason.to_s.empty?
      @state = StreamConsumer::STATE_STOPPED
      @stop_reason = reason
      callback = onStopped
      callback.call(reason) unless callback.nil?
    end
  end
end
|
@@ -0,0 +1,160 @@
|
|
1
|
+
#
|
2
|
+
# stream_consumer_http.rb - This file contains the StreamConsumer_HTTP class.
|
3
|
+
#
|
4
|
+
# Copyright (C) 2011 MediaSift Ltd
|
5
|
+
#
|
6
|
+
# == Overview
|
7
|
+
#
|
8
|
+
# The StreamConsumer_HTTP class implements HTTP streaming.
|
9
|
+
|
10
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__) + '/../')
|
11
|
+
|
12
|
+
require 'uri'
|
13
|
+
require 'socket'
|
14
|
+
require 'yajl'
|
15
|
+
|
16
|
+
module DataSift

  # StreamConsumer_HTTP class.
  #
  # Consumes a stream over a plain TCP socket speaking HTTP/1.1. Chunked
  # responses are fed chunk by chunk into a Yajl parser, which hands each
  # parsed object to the block given to consume.
  #
  class StreamConsumer_HTTP < StreamConsumer

    # MIME types accepted for non-chunked response bodies.
    ALLOWED_MIME_TYPES = ['application/json', 'text/plain'].freeze

    # Constructor. Requires valid user and definition objects.
    def initialize(user, definition)
      super
    end

    # Connects (if necessary) and consumes the stream, passing each parsed
    # object to +block+. Keeps reconnecting while auto_reconnect is set and
    # the consumer is still running, then disconnects and calls onStop with
    # 'Stop requested' or 'Connection dropped'.
    #
    # Raises StreamError for a chunked response without a block, or for a
    # non-chunked response with an unhandled MIME type. For a non-chunked
    # response the parser's result is returned directly and onStop is not
    # called.
    #
    def onStart(&block)
      loop do
        reconnect() if @socket.nil? || @socket.closed?

        parser = Yajl::Parser.new
        parser.on_parse_complete = block if block_given?

        if @response_head[:headers]['Transfer-Encoding'] == 'chunked'
          raise StreamError, 'Chunked responses detected, but no block given to handle the chunks.' unless block_given?
          read_chunks(parser)
          # The stream has ended or a stop was requested. Close the socket so
          # that another pass through the loop really reconnects instead of
          # spinning on a socket that is at EOF but still open.
          disconnect()
        else
          return read_body(parser, &block)
        end

        break unless @auto_reconnect && @state == StreamConsumer::STATE_RUNNING
      end

      disconnect()

      if @state == StreamConsumer::STATE_STOPPING
        @stop_reason = 'Stop requested'
      else
        @stop_reason = 'Connection dropped'
      end

      onStop(@stop_reason)
    end

    # Opens a new connection to the stream for this definition's hash and
    # reads the response head into @response_head (:version, :code, :msg and
    # :headers). On a 200 the state becomes STATE_RUNNING. Any other status
    # except 404 is retried with a back-off of 10 seconds, doubling up to 320.
    #
    # Raises StreamError on a 404, or when the back-off is exhausted.
    #
    def reconnect()
      uri = URI.parse('http://' + User::STREAM_BASE_URL + @definition.hash +
        '?username=' + CGI.escape(@user.username) + '&api_key=' + CGI.escape(@user.api_key))

      request = "GET #{uri.path}#{uri.query ? '?' + uri.query : nil} HTTP/1.1\r\n"
      request << "Host: #{uri.host}\r\n"
      request << "User-Agent: #{@user.getUserAgent()}\r\n"
      request << "Accept: */*\r\n"
      request << "\r\n"

      connection_delay = 0
      # Track success locally: when reconnecting after a dropped stream the
      # state is still STATE_RUNNING, so it cannot signal "connected".
      connected = false

      until connected
        # Close the socket if it's open
        disconnect()

        # Back off a bit if required
        sleep(connection_delay) if connection_delay > 0

        # TODO: network-level errors (e.g. a refused connection) are not
        # retried here. An earlier, disabled draft backed off in one second
        # steps (1s up to 17s) and then raised
        # StreamError 'Connection failed due to a network error'.
        @socket = TCPSocket.new(uri.host, uri.port)
        @socket.write(request)
        @response_head = { :headers => {} }

        # Read the headers
        @socket.each_line do |line|
          break if line == "\r\n" # end of the headers

          name, value = line.split(': ', 2)
          if value.nil?
            # No "name: value" separator, so this is the status line. Limit
            # the split so multi-word reason phrases stay intact.
            version, code, msg = line.strip.split(' ', 3)
            @response_head[:version] = version
            @response_head[:code] = code.to_i
            @response_head[:msg] = msg
          else
            @response_head[:headers][name] = value.strip
          end
        end

        status = @response_head[:code]
        if status == 200
          # Success!
          @state = StreamConsumer::STATE_RUNNING
          connected = true
        elsif status == 404
          disconnect()
          raise StreamError, 'Hash not found!'
        else
          # Interpolate: the status code is an Integer and cannot be
          # concatenated onto a String.
          failure = "Connection failed: #{status} #{@response_head[:msg]}"
          puts failure
          if connection_delay == 0
            connection_delay = 10
          elsif connection_delay < 240
            connection_delay *= 2
          else
            disconnect()
            raise StreamError, failure
          end
        end
      end
    end

    # Closes the socket if one is open.
    def disconnect()
      @socket.close unless @socket.nil? || @socket.closed?
    end

    private

    # Reads a chunked response body from the socket, feeding each chunk to
    # +parser+, until the stream ends, the terminating chunk arrives or a
    # stop has been requested.
    def read_chunks(parser)
      while @state != StreamConsumer::STATE_STOPPING && !@socket.eof? && (line = @socket.gets)
        break if line.match(/^0.*?\r\n/) # zero-size chunk terminates the body
        next if line == "\r\n"           # blank line that follows each chunk

        size = line.hex
        # IO#read(size) blocks until size bytes have arrived or EOF is hit, so
        # a short result means the stream ended mid-chunk; whatever was
        # received is still handed to the parser rather than dropped.
        data = @socket.read(size)
        next if data.nil? || data.empty?

        parser << data
      end
    end

    # Parses a non-chunked response body and returns the parser's result.
    # NOTE(review): the gzip / deflate / bzip2 branches rely on the
    # corresponding Yajl stream readers (and Zlib) having been loaded
    # elsewhere - this file only requires 'yajl' - confirm before relying on
    # them.
    def read_body(parser, &block)
      headers = @response_head[:headers]
      content_type = headers['Content-Type'].to_s.split(';').first.to_s
      raise StreamError, 'Unhandled response MIME type ' + content_type unless ALLOWED_MIME_TYPES.include?(content_type)

      opts = {} # no extra parser options
      case headers['Content-Encoding']
      when 'gzip'
        Yajl::Gzip::StreamReader.parse(@socket, opts, &block)
      when 'deflate'
        Yajl::Deflate::StreamReader.parse(@socket, opts.merge({:deflate_options => -Zlib::MAX_WBITS}), &block)
      when 'bzip2'
        Yajl::Bzip2::StreamReader.parse(@socket, opts, &block)
      else
        parser.parse(@socket)
      end
    end

  end

end
|