embulk-filter-crawler 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 4cb84672dc6d8f59c550c3faca25b051abe23209
4
- data.tar.gz: 35fa411ca6985bf8815ef79f40c32865cc11b109
3
+ metadata.gz: 30a79fc0749463b0dce27c96cb1e32f6cdced9f2
4
+ data.tar.gz: 80db4bafe0d5b0418aa8cf18be52c8cfe66f2e95
5
5
  SHA512:
6
- metadata.gz: a165c5351f4d4d929ec98f3b4d539cba692fbf6ed8173994a396381d58a02754ba9f36d9e36625ad4e0f2f55bfc31c1e547549fca195e7e726ede28cd7d8758e
7
- data.tar.gz: c53e77f4f465f9da0d733a3b0db8b955e3b3c0c6ccf371a34f7487a4a98cf7052987f35974ada23824a1f72a31f1eebaa4351d9f32b54f9e4502e699e7625fba
6
+ metadata.gz: 7c7df4cb9d0e103a8dc17e34ee90be45da7c659f5a0a0fc06d28215854dc12b026de9e4eaf4b65354c3befcc191c0a60efc33bd3c0c39ba4ac77eb3a0d327760
7
+ data.tar.gz: fc890159a20819a203eae0ae61eb4cce3a9b8bc11d2e8903a61bd5a59ba1ccf451dd8f58e5e918eeb0d1f0f07fcff2138491c343c1eb7a1b52e01e219816614a
data/README.md CHANGED
@@ -10,13 +10,14 @@ Write short description here and build.gradle file.
10
10
 
11
11
  - **target_key**: base_url column key name (string, required)
12
12
  - **max_depth_of_crawling**: max depth of crawling (integer, default: unlimited)
13
- - **seed_size**: seed_size (string, default: `"myvalue"`)
14
- - **number_of_crawlers**: parallelism (string, default: 1)
15
- - **max_pages_to_fetch**: max_pages_to_fetch (string, default: unlimited)
13
+ - **number_of_crawlers**: parallelism (integer, default: 1)
14
+ - **max_pages_to_fetch**: max_pages_to_fetch (integer, default: unlimited)
16
15
  - **crawl_storage_folder**: crawl_storage_folder (string, required)
17
16
  - **politeness_delay**: politeness_delay (integer, default: null)
18
17
  - **user_agent_string**: user_agent_string (string, default: null)
19
- - **keep_input**: keep_input (string, default: `"myvalue"`)
18
+ - **output_prefix**: output_prefix (string, default: "")
19
+ - **connection_timeout**: connection timeout in milliseconds (integer, default: 30000)
20
+ - **socket_timeout**: socket timeout in milliseconds (integer, default: 20000)
20
21
 
21
22
  ## Example
22
23
 
@@ -33,7 +34,6 @@ filters:
33
34
  - type: crawler
34
35
  target_key: url
35
36
  number_of_crawlers: 10
36
- seed_size: 100
37
37
  max_depth_of_crawling: 4
38
38
  politeness_delay: 100
39
39
  crawl_storage_folder: "/tmp/crawl/%s"
data/build.gradle CHANGED
@@ -14,7 +14,7 @@ configurations {
14
14
  provided
15
15
  }
16
16
 
17
- version = "0.1.0"
17
+ version = "0.1.1"
18
18
 
19
19
  sourceCompatibility = 1.7
20
20
  targetCompatibility = 1.7
@@ -74,6 +74,14 @@ public class CrawlerFilterPlugin
74
74
  @Config("should_not_visit_pattern")
75
75
  @ConfigDefault("null")
76
76
  public Optional<String> getShouldNotVisitPattern();
77
+
78
+ @Config("connection_timeout")
79
+ @ConfigDefault("null")
80
+ public Optional<Integer> getConnectionTimeout();
81
+
82
+ @Config("socket_timeout")
83
+ @ConfigDefault("null")
84
+ public Optional<Integer> getSocketTimeout();
77
85
  }
78
86
 
79
87
  @Override
@@ -202,6 +210,12 @@ public class CrawlerFilterPlugin
202
210
  if (task.getUserAgentString().isPresent()) {
203
211
  config.setUserAgentString(task.getUserAgentString().get());
204
212
  }
213
+ if (task.getSocketTimeout().isPresent()) {
214
+ config.setSocketTimeout(task.getSocketTimeout().get());
215
+ }
216
+ if (task.getConnectionTimeout().isPresent()) {
217
+ config.setConnectionTimeout(task.getConnectionTimeout().get());
218
+ }
205
219
 
206
220
  PageFetcher pageFetcher = new PageFetcher(config);
207
221
  RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-filter-crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - toyama0919
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-03-25 00:00:00.000000000 Z
11
+ date: 2016-03-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -73,7 +73,7 @@ files:
73
73
  - classpath/commons-logging-1.2.jar
74
74
  - classpath/crawler4j-4.2.jar
75
75
  - classpath/dom4j-1.6.1.jar
76
- - classpath/embulk-filter-crawler-0.1.0.jar
76
+ - classpath/embulk-filter-crawler-0.1.1.jar
77
77
  - classpath/fontbox-1.8.4.jar
78
78
  - classpath/geronimo-stax-api_1.0_spec-1.0.1.jar
79
79
  - classpath/httpclient-4.4.jar