ruboty-ymcrawl 0.0.1 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ff618b303eaf36d9fb1702df229d4bef5a97b76b
4
- data.tar.gz: c2aaefedce57faf5432d349f15bcec566bc98a33
3
+ metadata.gz: f1fa7723d9cb543e8a2be4c47f7639f579798f33
4
+ data.tar.gz: 57d00ba352089fc46a5f337e525424b2eba68dc4
5
5
  SHA512:
6
- metadata.gz: 2ea55a004dc5af6c3d30c3eedf5856e3a2e270fddee36e6abd24cfede67a2541189d167dc916b1234394c947aee340d963860944e642ccb8bfa1109dcb9be664
7
- data.tar.gz: 27f054948dfdbfb4515b72242976b564f8104f29f785d488a0f8b622e26203563bac20b25f5e277a20f2c068cabe273e2050575f19b2ac018595c6b0c395c688
6
+ metadata.gz: 2a388ae8e594e16e25721f92a22dcf42d9845986b13d0880d1d9d7961edc69452a2ce88f0672ecf9a8122349cd0dd1acb7ad3088907c0702eda78863c24a16fb
7
+ data.tar.gz: c26de2ae35453342d9d9e63a83880df8ab4adb6d07e528728a845b2a60ee826dde6f2c69816f7e2652f8a6ac31621bf552d6c3a69fc6951f004776112c32d1fb
@@ -1,15 +1,88 @@
1
+ require_relative 'src/main'
2
+ require 'singleton'
3
+
1
4
  module Ruboty
2
5
  module Handlers
3
- class YMCrawl < Base
6
+
7
# Process-wide holder for the single YMCrawl::Core instance used by the
# chat handlers defined in this file.
class CrawlManager
  include Singleton

  def initialize
    @crawl = nil
  end

  # Returns the shared YMCrawl::Core, building it on first use.
  #
  # @return [YMCrawl::Core]
  def get_crawl
    @crawl ||= YMCrawl::Core.new
  end
end
18
+
19
# Handler that answers messages ending in "hello" with a fixed greeting.
class Hello < Base
  # Parenthesized so the regexp literal is not parsed as an ambiguous first
  # argument, and to match the other handlers in this file.
  on(/hello\z/i, name: "hello", description: "Return hello")

  # Replies to the matched message.
  #
  # @param message the matched message object; only #reply is called on it
  def hello(message)
    message.reply("hello!!")
  end
end
26
+
27
# Handler that crawls images from the URL given in a chat message and replies
# with whatever YMCrawl::Core#start returns.
class Crawl < Base
  on(
    /crawl ?(?<url>.+)?\z/i,
    name: "crawl",
    description: "crawl image"
  )

  # Builds the instructions shown when the uploader has no access token yet.
  #
  # @param url [String] authorization URL the user has to visit
  # @return [String]
  def get_access_token_message(url)
    [
      "You don't have access token.",
      "1. Go to: #{url}",
      "2. Click \"Allow\" (you might have to log in first).",
      "3. reply to bot as \"@bot dropbox:auth (auth_code) \""
    ].join("\n")
  end

  # Runs a crawl for the URL captured from the message and reports the result.
  # Any failure is reported back to the chat instead of being raised.
  #
  # @param message the matched message; message[:url] is the captured URL
  def crawl(message)
    url = message[:url] || "-- please set url --"
    core = CrawlManager.instance.get_crawl
    uploader = core.get_uploader

    # Core only builds an uploader when save_to is not "local", so it can be
    # nil here. When the upload target is remote and no access token has been
    # stored yet, show where to obtain one and stop.
    if uploader && uploader.get_name != "local" && !uploader.access_token?
      message.reply(get_access_token_message(uploader.get_access_token_url))
      return nil
    end

    message.reply("rubot is crawling from #{url}")
    zip_paths = core.start([url])
    message.reply("get zip file => #{zip_paths}")
  rescue URI::InvalidURIError, Addressable::URI::InvalidURIError => ex
    puts ex
    message.reply("URL is invalid. please retry.")
  rescue StandardError => ex
    puts "error raise in Crawl.crawl"
    puts ex
    message.reply("Sorry, error occurred.")
    message.reply("Please feedback this error to niboshiporipori@gmail.com")
    # Reply with the text of the error, not the exception object itself
    message.reply(ex.message)
  end
end
68
+
69
# Handler that exchanges a Dropbox authorization code for an access token and
# stores the token through YMCrawl::DataManager.
class VerifyAuthCode < Base
  on(
    /dropbox:auth ?(?<auth_code>.+)?\z/i,
    name: "verify_auth_code",
    description: "add access token by auth code"
  )

  # Verifies the authorization code captured from the message.
  # Failures are reported back to the chat instead of being raised.
  #
  # @param message the matched message; message[:auth_code] is the captured code
  def verify_auth_code(message)
    auth_code = message[:auth_code].to_s.strip
    if auth_code.empty?
      # Do not send a placeholder string to the uploader as if it were a code
      message.reply("-- please set auth_code --")
      return nil
    end

    uploader = CrawlManager.instance.get_crawl.get_uploader
    if uploader.nil?
      # Core builds no uploader when save_to is "local"
      message.reply("No uploader is configured, so an access token is not needed.")
      return nil
    end

    access_token = uploader.verify_auth_code(auth_code)
    YMCrawl::DataManager.instance.update_access_token(uploader.get_name, access_token)

    message.reply("You added access token!")
    message.reply("Try clawling again!")
  rescue StandardError => ex
    puts "error raise in VerifyAuthCode.verify_auth_code"
    puts ex
    message.reply("Sorry, error occurred.")
    message.reply(ex.message)
  end
end
86
+
14
87
  end
15
88
  end
@@ -0,0 +1,208 @@
1
+ require 'open-uri'
2
+ require 'nokogiri'
3
+ require 'kconv'
4
+ require 'addressable/uri'
5
+ require 'singleton'
6
+
7
+ module YMCrawl
8
# Helpers for URL handling.
class URLUtil
  # Normalizes +url+ with Addressable::URI and returns the result as a String.
  #
  # @param url [String]
  # @return [String]
  # @raise [ArgumentError] when url is nil (previously this surfaced later as
  #   a NoMethodError on nil after printing a warning banner)
  def self.normalize_url(url)
    raise ArgumentError, "url must not be nil" if url.nil?
    Addressable::URI.parse(url).normalize.to_s
  end
end
15
+
16
# Wraps a CSS selector string.
class Selector
  # @param css [String] the selector text
  def initialize(css)
    @selector = css
  end

  # @return the selector exactly as it was given to the constructor
  def to_s
    @selector
  end

  # Returns the tag name of the last compound selector, with pseudo-classes,
  # attribute filters, classes and ids stripped (e.g. "div > a.link:hover"
  # gives "a").
  #
  # @return [String, nil] nil when the selector is empty
  def get_last_tag
    # Combinators (whitespace, "+", ">") separate compound selectors
    last_compound = @selector.to_s.split(/[\s+>]/).last
    return nil if last_compound.nil?
    # Everything from the first ":", ",", "[", "." or "#" on is not the tag
    last_compound.split(/[:,\[.#]/).first
  end
end
30
+
31
# Tracks the last access time per host and enforces a minimum interval
# between two accesses to the same host.
class HostManager
  include Singleton
  DEFAULT_WAIT_TIME = 2

  def initialize
    @host_list = {}
    @wait_time = DEFAULT_WAIT_TIME
  end

  # Sets the minimum interval between accesses to one host.
  # A nil value falls back to DEFAULT_WAIT_TIME so that #wait can always
  # compare against a number.
  #
  # @param wait_time [Numeric, nil] interval, in the unit Kernel#sleep takes
  def set_wait_time(wait_time)
    @wait_time = wait_time || DEFAULT_WAIT_TIME
  end

  # Sleeps until at least the configured interval has passed since the last
  # recorded access to the host of +url+, then records the current time as
  # the new last access.
  #
  # @param url [String]
  # @return [Time] the recorded access time
  def wait(url)
    host = URI(URLUtil.normalize_url(url)).host
    last_access = @host_list[host]
    unless last_access.nil?
      remaining = @wait_time - (Time.now - last_access)
      if remaining > 0
        puts "sleep: #{remaining}sec."
        sleep(remaining)
      end
    end
    @host_list[host] = Time.now
  end
end
52
+
53
# Represents the HTML document fetched from one URL.
class Page
  # Raised when the page could not be fetched because of an HTTP error.
  class PageError < StandardError; end

  # Fetches and parses the document immediately.
  #
  # @param url [String]
  # @raise [PageError] when the server answers with an HTTP error
  def initialize(url)
    @url = url
    @doc = get_doc
  end

  # Returns one Element per node matching the CSS selector.
  #
  # @param selector [String]
  # @return [Array<Element>]
  def search_elements(selector)
    @doc.css(selector).map { |node| Element.new(node) }
  end

  private

  # Downloads @url (after the per-host wait) and parses it as UTF-8 HTML.
  def get_doc
    puts "get_doc from #{@url}"
    HostManager.instance.wait(@url)
    # Open through the parsed URI rather than Kernel#open: the URL originates
    # from chat input or scraped links, and Kernel#open would also accept
    # "|command" and local file paths.
    html = URI.parse(URLUtil.normalize_url(@url)).open("r:binary") { |io| io.read }
    Nokogiri::HTML(html.toutf8, nil, 'utf-8')
  rescue OpenURI::HTTPError => ex
    puts "failed URL: #{@url}"
    puts "HTTP Error message: #{ex.message}"
    raise PageError, ex.message
  end
end
77
+
78
# Represents one node of a Page that was extracted by a selector.
# The wrapped node must respond to #name, #content and #[] (attribute lookup).
class Element
  def initialize(doc)
    @doc = doc
  end

  # @return the value of the node's "href" attribute
  def get_url
    @doc["href"]
  end

  # Returns the URL of the image this node points to.
  #
  # @return the "href" of an <a> node or the "src" of an <img> node
  # @raise [ArgumentError] for any other tag
  def get_image_url
    case @doc.name
    when "a" then @doc["href"]
    when "img" then @doc["src"]
    else raise ArgumentError, "in Element: <#{@doc.name}> is neither <a> nor <img>"
    end
  end

  # Returns the title to use for the image: the "title" attribute of an
  # <img>, otherwise the node's text. Missing or blank titles become
  # "noname" so that callers never build a file name from an empty string.
  #
  # @return [String]
  def get_image_title
    title = @doc.name == "img" ? @doc["title"] : @doc.content
    (title.nil? || title.strip.empty?) ? "noname" : title
  end

  # @return the article title (the node's text)
  def get_title
    @doc.content
  end

  # @return [Integer] the node's text converted with String#to_i
  def get_page_index_max
    @doc.content.to_i
  end

  # Dispatches to the getter that matches +target+.
  #
  # @param target [Symbol] :url, :image, :image_title, :title or :page_index_max
  # @return the getter's result, or nil for an unknown target
  def get_content(target)
    case target
    when :url then get_url
    when :image then get_image_url
    when :image_title then get_image_title
    when :title then get_title
    when :page_index_max then get_page_index_max
    end
  end
end
112
+
113
# Scrapes images from a site according to the CSS selectors in its site data.
class Crawler
  INDEX_STR = "{index}" # placeholder for the page index in "next_page_appendix"

  # @param dir [String] directory under which one sub-directory per article is created
  # @param site_data [Hash] site entry; reads "css" (with "image", "image_title",
  #   "title", "page_index_max"), "page_index_min" and "next_page_appendix"
  # @param wait_time [Numeric] per-host wait passed on to HostManager
  def initialize(dir, site_data, wait_time)
    HostManager.instance.set_wait_time(wait_time)
    css = site_data["css"]
    @selectors = {}
    [:image, :image_title, :title, :page_index_max].each do |target|
      # Accept a missing entry or a bare string, and drop empty selectors,
      # which cannot be evaluated
      usable = Array(css[target.to_s]).reject { |s| s.to_s.empty? }
      @selectors[target] = usable.map { |s| Selector.new(s) }
    end
    @page_index_min = site_data["page_index_min"]
    @next_page_appendix = site_data["next_page_appendix"] || ""
    @dir = dir
  end

  # Downloads every image found on each page of the article at +original_url+.
  #
  # @param original_url [String]
  # @return [String] the directory the images were saved to
  def save_images(original_url)
    article_title = get_contents(original_url, :title).first
    dst_dir = "#{@dir}/#{sanitize_name(article_title)}"
    (@page_index_min..get_page_index_max(original_url)).each do |page_index|
      page_url = "#{original_url}#{get_next_page_appendix_with_index(page_index)}"
      image_urls = get_contents(page_url, :image)
      image_titles = get_contents(page_url, :image_title)
      image_urls.zip(image_titles) do |image_url, image_title|
        save_image(dst_dir, image_url, image_title) unless image_url.nil?
      end
    end
    dst_dir
  end

  private

  # Turns scraped text into a single safe path component: path separators
  # are replaced and empty, "." and ".." names become "noname". The result
  # ends up in paths that are written to and later deleted recursively, so
  # it must not be able to leave the destination directory.
  def sanitize_name(name)
    cleaned = name.to_s.gsub(%r{[/\\]}, "_").strip
    (cleaned.empty? || cleaned == "." || cleaned == "..") ? "noname" : cleaned
  end

  # Returns +org_name+, or +org_name+ with a numeric suffix before the
  # extension when a file of that name already exists in +dir+.
  def get_unique_name(dir, org_name)
    name = org_name || "noname"
    ext = File.extname(name)
    basename = File.basename(name, '.*')
    candidate = "#{basename}#{ext}"
    index = 0
    while FileTest.exist?("#{dir}/#{candidate}")
      index += 1
      candidate = "#{basename}#{index}#{ext}"
    end
    candidate
  end

  # Downloads the image at +url+ into +dst_dir+, named after +title+.
  # A failed download is logged and its partial file removed.
  def save_image(dst_dir, url, title)
    puts "src: #{url}"
    filename = "#{sanitize_name(title)}#{File.extname(url)}"
    FileUtils.mkdir_p(dst_dir)
    file_path = "#{dst_dir}/#{get_unique_name(dst_dir, filename)}"
    HostManager.instance.wait(url)
    begin
      File.open(file_path, 'wb') do |output|
        puts "dst: #{file_path}"
        # Open through the parsed URI, not Kernel#open, because the URL is scraped
        URI.parse(URLUtil.normalize_url(url)).open { |data| output.write(data.read) }
      end
    rescue StandardError => ex
      # e.g. the image does not exist: drop the empty file that was created
      puts "image not exist."
      puts ex.message
      File.delete(file_path) if File.exist?(file_path)
    end
  end

  # Returns the string to append to the article URL for page +index+.
  def get_next_page_appendix_with_index(index)
    return "" if @next_page_appendix == ""
    @next_page_appendix.gsub(INDEX_STR, index.to_s)
  end

  # Returns the last page index of the article, falling back to the minimum
  # index when the site has no paging or no usable value is found.
  def get_page_index_max(url)
    return @page_index_min if @next_page_appendix == ""
    candidate = get_contents(url, :page_index_max).first
    candidate.kind_of?(Integer) ? candidate : @page_index_min
  end

  # Follows the selector chain for +target+ starting at +url+: every selector
  # but the last yields links to descend into, the last yields the content.
  #
  # @return [Array] extracted values; empty when there are no selectors or
  #   the page could not be fetched
  def get_contents(url, target, nest = 0)
    selectors = @selectors[target]
    return [] if selectors.empty? || url.nil?
    elements = Page.new(url).search_elements(selectors[nest].to_s)
    if nest >= selectors.length - 1
      return elements.map { |element| element.get_content(target) }
    end
    # Run the next selector against each link found at this level
    links = elements.map { |element| element.get_content(:url) }.compact
    links.flat_map { |link| get_contents(link, target, nest + 1) }
  rescue Page::PageError => ex
    puts "error in get_contents #{ex}"
    []
  end
end
208
+ end
@@ -0,0 +1,60 @@
1
+ # Install the SDK with "gem install dropbox-sdk"
2
+ require 'dropbox_sdk'
3
+ module YMCrawl
4
# Thin wrapper around the Dropbox SDK client and its OAuth2 no-redirect flow.
class DropboxManager
  # @param app_key [String] Dropbox application key
  # @param app_sec [String] Dropbox application secret
  def initialize(app_key, app_sec)
    @app_key = app_key
    @app_sec = app_sec
    @client = nil
    @access_token = nil
    @web_auth = nil
  end

  # Creates the Dropbox client for the given token and checks it by
  # requesting the account info.
  #
  # @return the access token in use, or nil when Dropbox rejects the token
  def login(arg_access_token = nil)
    unless @client.nil?
      puts "already logged in!"
      return @access_token
    end

    client = DropboxClient.new(arg_access_token)
    puts "account info: #{client.account_info}"
    # Keep the client only once it has been verified, so that a failed
    # login is not reported as "already logged in" on the next call
    @client = client
    @access_token = arg_access_token
  rescue DropboxError
    puts "---- access token is invalid ----"
    nil
  end

  # Starts a new OAuth2 flow.
  #
  # @return the URL returned by the flow's #start
  def get_auth_code_url
    @web_auth = DropboxOAuth2FlowNoRedirect.new(@app_key, @app_sec)
    @web_auth.start
  end

  # Finishes the flow started by #get_auth_code_url.
  #
  # @return the first element of what the flow's #finish returns
  #   (used by callers as the access token)
  # @raise [RuntimeError] when no flow has been started
  def get_access_token(auth_code)
    if @web_auth.nil?
      raise "authorization flow has not been started; call get_auth_code_url first"
    end
    @web_auth.finish(auth_code)[0]
  end

  # Uploads a local file.
  #
  # @param command [Array] [local_path] or [local_path, remote_name]; the
  #   remote name defaults to the local file's base name
  # @return the result of DropboxClient#put_file, or nil when the local file
  #   is missing or unreadable
  def put(command)
    fname = command[0]
    unless fname && !fname.empty? && File.file?(fname) && File.readable?(fname)
      puts "couldn't find the file #{ fname }"
      return nil
    end

    new_name = command[1] || File.basename(fname)
    # Block form closes the file once the upload call returns
    result = File.open(fname, 'rb') { |file| @client.put_file(new_name, file) }
    pp result
  end

  # @return the result of DropboxClient#shares for +path+
  def get_share_link(path)
    @client.shares(path)
  end
end
60
+ end
@@ -0,0 +1,161 @@
1
+ require_relative 'crawler'
2
+ require_relative 'dropbox.rb'
3
+ require 'optparse'
4
+ require 'json'
5
+ require 'zipruby'
6
+ require 'find'
7
+ require 'kconv'
8
+ require 'json-schema'
9
+
10
+ module YMCrawl
11
# Relative paths of the files YMCrawl reads and writes.
ORG_SETTING_FILE_PATH = "YMCrawlfile".freeze
SETTING_FILE_PATH = ORG_SETTING_FILE_PATH
SCHEMA_FILE_PATH = "YMCrawl_schema.json".freeze
UPLOADER_SCHEMA_FILE_PATH = "uploader_schema.json".freeze
SITE_JSON_PATH = "site.json".freeze
16
+
17
# Loads the settings file and the per-site selector data, and gives access to
# the values the crawler and uploader need.
class DataManager
  include Singleton

  # Reads the settings and site files and prints the schema validation
  # results for the settings and for the active uploader entry.
  def initialize
    @setting = JSON.parse(File.read(SETTING_FILE_PATH))
    puts "YMCrawlfile valid"
    puts JSON::Validator.fully_validate(SCHEMA_FILE_PATH, @setting, :insert_defaults => true).to_s
    @sites = get_sites_json(SITE_JSON_PATH)
    # Keep a local copy when the site data came from the configured source
    File.write(SITE_JSON_PATH, JSON.unparse(@sites)) unless FileTest.exist?(SITE_JSON_PATH)
    puts "uploader valid"
    puts JSON::Validator.fully_validate(UPLOADER_SCHEMA_FILE_PATH, get_uploader_data, :insert_defaults => true).to_s
  end

  # Reads the JSON that holds, per site, the CSS selectors used to find
  # images. Falls back to the "site_json" setting when +path+ does not exist.
  #
  # @return the parsed JSON
  def get_sites_json(path)
    path = @setting["site_json"] unless FileTest.exist?(path)
    puts "reading site json file from #{path}"
    # NOTE(review): Kernel#open is kept because "site_json" may be something
    # other than a local path (open-uri is loaded by crawler.rb) - confirm.
    JSON.parse(open(path) { |io| io.read })
  end

  # Returns the site entry whose "host" equals the host of +url+, or the
  # "default" entry when none matches.
  def get_current_uploder_info(url)
    host = URI(url).host
    matched = @sites.values.find { |site| site["host"] == host }
    matched || @sites["default"]
  end

  # Stores +access_token+ for the named uploader and rewrites the settings file.
  #
  # @raise [ArgumentError] when the settings have no entry for +uploader_name+
  def update_access_token(uploader_name, access_token)
    uploader_setting = @setting["uploader"][uploader_name]
    raise ArgumentError, "uploader #{uploader_name} is not configured" if uploader_setting.nil?
    uploader_setting["access_token"] = access_token
    # The settings are deliberately not printed: they contain the token
    puts "access token for #{uploader_name} updated"
    File.open(SETTING_FILE_PATH, 'w') { |io| JSON.dump(@setting, io) }
  end

  def get_setting
    @setting
  end

  def get_save_to
    @setting["save_to"]
  end

  # @return the settings entry of the uploader selected by "save_to"
  def get_uploader_data
    @setting["uploader"][get_save_to]
  end

  def get_current_access_token
    get_uploader_data["access_token"]
  end

  # The app key and secret are read from the environment, e.g.
  # DROPBOX_APP_KEY / DROPBOX_APP_SECRET when "save_to" is "dropbox".
  def get_current_app_key
    ENV["#{@setting["save_to"].upcase }_APP_KEY"]
  end

  def get_current_app_secret
    ENV["#{@setting["save_to"].upcase }_APP_SECRET"]
  end
end
61
+
62
# Entry point that crawls images and, unless "save_to" is "local", zips and
# uploads them.
class Core
  def initialize
    @data = DataManager.instance
    @uploader = nil
    # No uploader is needed (or built) when images stay on the local disk
    unless @data.get_save_to == "local"
      @uploader = Uploader.new(@data.get_save_to, @data.get_current_app_key, @data.get_current_app_secret, @data.get_current_access_token)
    end
  end

  # Crawls +urls+ and uploads the result.
  #
  # @return what #upload returns
  def start(urls)
    upload(crawl(urls))
  end

  # Crawls and saves the images of every URL. The site data is chosen from
  # the first URL.
  #
  # @return [Array<String>] one saved directory per URL
  def crawl(urls)
    setting = @data.get_setting
    crawler = Crawler.new(setting["dst_dir"], @data.get_current_uploder_info(urls[0]), setting["wait_time"])
    urls.map { |url| crawler.save_images(url) }
  end

  # Zips each directory, removes the directories, uploads the zip files and
  # removes them as well.
  #
  # @param file_dirs [Array<String>]
  # @return [Array<String>, nil] share URLs of the uploaded zips, or nil when
  #   "save_to" is "local"
  def upload(file_dirs)
    return nil if @data.get_save_to == "local"

    @uploader.login(@data.get_current_access_token)
    zip_paths = file_dirs.map { |dir| zip_dir(dir) }
    remove_dirs(file_dirs)
    zip_paths.each_with_object([]) do |path, share_paths|
      puts "uploading #{path} to dropbox"
      put_result = @uploader.put([path])
      File.delete(path) if File.exist?(path)
      # The uploader returns nil when it could not read the file
      next if put_result.nil?
      share_paths << @uploader.get_share_link(put_result["path"])["url"]
    end
  end

  # Puts the files directly under +src+ into "<src>.zip".
  #
  # @return [String] path of the zip file
  def zip_dir(src)
    dst = "#{src}.zip"
    Zip::Archive.open(dst, Zip::CREATE) do |ar|
      Dir.glob("#{src}/*").each { |item| ar.add_file(item) }
    end
    dst
  end

  def get_uploader
    @uploader
  end

  private

  # Removes the crawled directories. The paths are first re-tagged with the
  # encoding named by LANG (or utf-8); if anything fails, the removal is
  # retried once with ascii-8bit. A failure of the retry is logged, not raised.
  def remove_dirs(dirs)
    encoding = ENV["LANG"] || "utf-8"
    begin
      dirs.each do |dir|
        # Skip what an earlier attempt already removed
        next unless File.exist?(dir)
        FileUtils.remove_entry_secure(dir.force_encoding(encoding))
      end
    rescue StandardError => ex
      if encoding != "ascii-8bit"
        encoding = "ascii-8bit"
        retry
      end
      puts "failed to remove crawled directories: #{ex.message}"
    end
  end
end
117
+
118
# Abstraction over the service that files are uploaded to.
class Uploader
  # @param name [String] uploader name; only "dropbox" is supported
  # @param app_key [String]
  # @param app_secret [String]
  # @param access_token [String, nil] previously stored token, if any
  # @raise [ArgumentError] when +name+ is not a supported uploader
  def initialize(name, app_key, app_secret, access_token = nil)
    @name = name
    @app_key = app_key
    @app_secret = app_secret
    @access_token = access_token
    @c_uploader = nil
    @c_uploader = create_uploader
  end

  # Returns the concrete uploader for @name, creating it on first use.
  #
  # @raise [ArgumentError] when @name is not a supported uploader
  def create_uploader
    return @c_uploader unless @c_uploader.nil?
    case @name
    when "dropbox"
      @c_uploader = DropboxManager.new(@app_key, @app_secret)
    else
      raise ArgumentError, "uploader #{@name} is not found"
    end
  end

  # @return [Boolean] true when a non-empty access token is held
  def access_token?
    !(@access_token.nil? || @access_token == "")
  end

  # Exchanges +auth_code+ for an access token and remembers it.
  #
  # @return the access token
  def verify_auth_code(auth_code)
    @access_token = @c_uploader.get_access_token(auth_code)
  end

  # Logs in with +token+, or with the remembered token when none is given.
  #
  # @return what the concrete uploader's #login returns
  def login(token = nil)
    @access_token = token unless token.nil?
    puts "---- access token isn't set when login!!!! ----" unless access_token?
    # Forward the effective token, not the possibly-nil argument
    @c_uploader.login(@access_token)
  end

  # @return the URL the user has to visit to obtain an authorization code
  def get_access_token_url
    error = "---- YMCrawl publishing new access token url. But you already have access token. ----"
    puts error if access_token?
    @c_uploader.get_auth_code_url
  end

  def get_name
    @name
  end

  def put(command)
    @c_uploader.put(command)
  end

  def get_share_link(path)
    @c_uploader.get_share_link(path)
  end
end
161
+ end
@@ -1,5 +1,5 @@
1
1
  module Ruboty
2
2
  module Ymcrawl
3
- VERSION = "0.0.1"
3
+ VERSION = "0.0.3"
4
4
  end
5
5
  end
@@ -1,9 +1,6 @@
1
+ require "ruboty/handlers/ymcrawl"
2
+ require "ruboty/ymcrawl/crawler"
3
+ require "ruboty/ymcrawl/dropbox"
4
+ require "ruboty/ymcrawl/main"
1
5
  require "ruboty/ymcrawl/version"
2
6
 
3
- module Ruboty
4
- module Ymcrawl
5
- def self.hoge
6
- puts "hello from ymcrawl hoge"
7
- end
8
- end
9
- end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruboty-ymcrawl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - mpk
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-12-09 00:00:00.000000000 Z
11
+ date: 2014-12-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: addressable
@@ -159,6 +159,9 @@ files:
159
159
  - Rakefile
160
160
  - lib/ruboty/handlers/ymcrawl.rb
161
161
  - lib/ruboty/ymcrawl.rb
162
+ - lib/ruboty/ymcrawl/crawler.rb
163
+ - lib/ruboty/ymcrawl/dropbox.rb
164
+ - lib/ruboty/ymcrawl/main.rb
162
165
  - lib/ruboty/ymcrawl/version.rb
163
166
  - ruboty-ymcrawl.gemspec
164
167
  - spec/ruboty/ymcrawl_spec.rb