embulk-input-twitterarchive 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: a29d2bbc465d3f797cc20e0edafc783e5d395c42
4
+ data.tar.gz: 00ea0f9c339f4e9509d2efa7bd3511594a804733
5
+ SHA512:
6
+ metadata.gz: a1de1efcd5f17097af462c92977cdaf818dfad2922cfea4bf183508a00ac00b6b035e573c778946eb6c0f6193fffc87324df6719b58b3e98783cc07dbdecd9d2
7
+ data.tar.gz: eb836861de9cac44f2179471e7de6a433dec3b77eca2b7a32028ea743d833a3eb49050e7f5ea084bc3a330a51f20c25e6750434aea7c8415e4af55794dc483f5
@@ -0,0 +1,4 @@
1
+ /.bundle/
2
+ /Gemfile.lock
3
+ /pkg/
4
+ /.ruby-version
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source 'https://rubygems.org/'
2
+ gemspec
@@ -0,0 +1,21 @@
1
+
2
+ MIT License
3
+
4
+ Permission is hereby granted, free of charge, to any person obtaining
5
+ a copy of this software and associated documentation files (the
6
+ "Software"), to deal in the Software without restriction, including
7
+ without limitation the rights to use, copy, modify, merge, publish,
8
+ distribute, sublicense, and/or sell copies of the Software, and to
9
+ permit persons to whom the Software is furnished to do so, subject to
10
+ the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be
13
+ included in all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,32 @@
1
+ # Twitterarchive input plugin for Embulk
2
+
3
+ Loads tweets from an extracted Twitter archive directory into Embulk, emitting one record per tweet.
4
+
5
+ ## Overview
6
+
7
+ * **Plugin type**: input
8
+ * **Resume supported**: yes
9
+ * **Cleanup supported**: yes
10
+ * **Guess supported**: no
11
+
12
+ ## Configuration
13
+
14
+ - **directory**: path to the extracted Twitter archive directory, i.e. the directory that contains `data/js/tweet_index.js` (string, required)
17
+
18
+ ## Example
19
+
20
+ ```yaml
21
+ in:
22
+ type: twitterarchive
23
+ directory: /path/to/twitter-archive
25
+ ```
26
+
27
+
28
+ ## Build
29
+
30
+ ```
31
+ $ rake
32
+ ```
@@ -0,0 +1,3 @@
1
+ require "bundler/gem_tasks"
2
+
3
+ task default: :build
@@ -0,0 +1,18 @@
1
+
2
# Gem specification for the Twitter Archive input plugin for Embulk.
Gem::Specification.new do |gem|
  # Identity
  gem.name     = "embulk-input-twitterarchive"
  gem.version  = "0.1.0"
  gem.licenses = ["MIT"]
  gem.homepage = "https://github.com/nazo/embulk-input-twitterarchive"

  # Authorship and descriptive text
  gem.authors     = ["takuya sato"]
  gem.email       = ["takuya0219@gmail.com"]
  gem.summary     = "Twitter Archive input plugin for Embulk"
  gem.description = "Loads records from Twitterarchive."

  # Packaged files: everything tracked by git plus any jars under classpath/.
  tracked_files = `git ls-files`.split("\n")
  bundled_jars  = Dir["classpath/*.jar"]
  gem.files         = tracked_files + bundled_jars
  gem.test_files    = gem.files.grep(%r{^(test|spec)/})
  gem.require_paths = ["lib"]

  # Tooling needed only to build the gem.
  gem.add_development_dependency 'bundler', ['~> 1.0']
  gem.add_development_dependency 'rake', ['>= 10.0']
end
@@ -0,0 +1,64 @@
1
+ require 'json'
2
+ require 'time'
3
+
4
module Embulk
  module Input

    # Input plugin that reads tweets out of an extracted Twitter archive.
    #
    # `transaction` reads `data/js/tweet_index.js` below the configured
    # directory and requests one task per file listed there. Each task's `run`
    # parses its tweet file and adds one record per tweet with the columns
    # id, text, source, in_reply_to_status_id and created_at.
    class TwitterarchiveInputPlugin < InputPlugin
      # input plugin file name must be: embulk/input/<name>.rb
      Plugin.register_input('twitterarchive', self)

      # Location of the archive index, relative to the archive directory.
      INDEX_PATH = 'data/js/tweet_index.js'

      # JavaScript assignments that precede the JSON payload in archive files.
      INDEX_ASSIGNMENT = /var tweet_index = /
      TWEETS_ASSIGNMENT = /Grailbird\.data\.tweets_[0-9]+_[0-9]+ = /

      # Reads an archive `.js` file and parses the JSON that follows its
      # leading JavaScript assignment.
      #
      # Only the first match of `assignment` is removed, so identical text that
      # happens to appear inside the data (for example in a tweet body) is
      # left untouched.
      #
      # @param path [String] file to read
      # @param assignment [Regexp] pattern matching the assignment prefix
      # @return [Object] the parsed JSON value
      # @raise [SystemCallError] if the file cannot be read
      # @raise [JSON::ParserError] if the remaining text is not valid JSON
      def self.read_archive_json(path, assignment)
        JSON.parse(File.read(path).sub(assignment, ''))
      end

      # Builds the task and schema, then hands over to `resume`.
      #
      # @param config the plugin configuration; `directory` (string) is
      #   required and names the extracted archive directory
      # @yield [task, columns, count] forwarded through `resume`
      # @return [Hash] the config diff returned by `resume`
      def self.transaction(config, &control)
        # No default: a missing directory is reported by the config layer
        # rather than surfacing later as a TypeError from File.join(nil, ...).
        directory = config.param('directory', :string)

        index = read_archive_json(File.join(directory, INDEX_PATH), INDEX_ASSIGNMENT)
        files = index.map { |file_meta| file_meta['file_name'] }

        task = {
          'files' => files,
          'directory' => directory
        }

        columns = [
          Column.new(0, 'id', :long),
          Column.new(1, 'text', :string),
          Column.new(2, 'source', :string),
          Column.new(3, 'in_reply_to_status_id', :long),
          Column.new(4, 'created_at', :timestamp),
        ]

        # One task per tweet file listed in the index.
        resume(task, columns, files.length, &control)
      end

      # Runs the tasks by yielding to the given block and logs the outcome.
      #
      # @param task [Hash] task definition built by `transaction`
      # @param columns [Array] output schema columns
      # @param count [Integer] number of tasks to run
      # @yield [task, columns, count] expected to return the commit reports
      # @return [Hash] an empty config diff
      def self.resume(task, columns, count, &control)
        puts "Twitter Archive input started."
        commit_reports = yield(task, columns, count)
        puts "Twitter Archive input finished. Commit reports = #{commit_reports.to_json}"

        {}
      end

      # @param task [Hash] task definition with 'files' and 'directory'
      # @param schema output schema (passed through to the superclass)
      # @param index [Integer] task index; selects this task's tweet file
      # @param page_builder receiver of the records produced by `run`
      def initialize(task, schema, index, page_builder)
        super
        @file = task['files'][index]
        @directory = task['directory']
      end

      # Parses this task's tweet file and adds one record per tweet.
      #
      # @return [Hash] an empty commit report
      def run
        puts "Twitter Archive input thread #{@index}..."

        tweets = self.class.read_archive_json(File.join(@directory, @file), TWEETS_ASSIGNMENT)
        tweets.each do |tweet|
          @page_builder.add([
            tweet['id'],
            tweet['text'],
            tweet['source'],
            tweet['in_reply_to_status_id'],
            Time.parse(tweet['created_at'])
          ])
        end
        @page_builder.finish # don't forget to call finish :-)

        {}
      end
    end

  end
end
metadata ADDED
@@ -0,0 +1,79 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: embulk-input-twitterarchive
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - takuya sato
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-08-02 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ description: Loads records from Twitterarchive.
42
+ email:
43
+ - takuya0219@gmail.com
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - ".gitignore"
49
+ - Gemfile
50
+ - LICENSE.txt
51
+ - README.md
52
+ - Rakefile
53
+ - embulk-input-twitterarchive.gemspec
54
+ - lib/embulk/input/twitterarchive.rb
55
+ homepage: https://github.com/nazo/embulk-input-twitterarchive
56
+ licenses:
57
+ - MIT
58
+ metadata: {}
59
+ post_install_message:
60
+ rdoc_options: []
61
+ require_paths:
62
+ - lib
63
+ required_ruby_version: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ version: '0'
68
+ required_rubygems_version: !ruby/object:Gem::Requirement
69
+ requirements:
70
+ - - ">="
71
+ - !ruby/object:Gem::Version
72
+ version: '0'
73
+ requirements: []
74
+ rubyforge_project:
75
+ rubygems_version: 2.4.5
76
+ signing_key:
77
+ specification_version: 4
78
+ summary: Twitter Archive input plugin for Embulk
79
+ test_files: []