embulk-filter-crawler 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 4cb84672dc6d8f59c550c3faca25b051abe23209
4
- data.tar.gz: 35fa411ca6985bf8815ef79f40c32865cc11b109
3
+ metadata.gz: 30a79fc0749463b0dce27c96cb1e32f6cdced9f2
4
+ data.tar.gz: 80db4bafe0d5b0418aa8cf18be52c8cfe66f2e95
5
5
  SHA512:
6
- metadata.gz: a165c5351f4d4d929ec98f3b4d539cba692fbf6ed8173994a396381d58a02754ba9f36d9e36625ad4e0f2f55bfc31c1e547549fca195e7e726ede28cd7d8758e
7
- data.tar.gz: c53e77f4f465f9da0d733a3b0db8b955e3b3c0c6ccf371a34f7487a4a98cf7052987f35974ada23824a1f72a31f1eebaa4351d9f32b54f9e4502e699e7625fba
6
+ metadata.gz: 7c7df4cb9d0e103a8dc17e34ee90be45da7c659f5a0a0fc06d28215854dc12b026de9e4eaf4b65354c3befcc191c0a60efc33bd3c0c39ba4ac77eb3a0d327760
7
+ data.tar.gz: fc890159a20819a203eae0ae61eb4cce3a9b8bc11d2e8903a61bd5a59ba1ccf451dd8f58e5e918eeb0d1f0f07fcff2138491c343c1eb7a1b52e01e219816614a
data/README.md CHANGED
@@ -10,13 +10,14 @@ Write short description here and build.gradle file.
10
10
 
11
11
  - **target_key**: base_url column key name (string, required)
12
12
  - **max_depth_of_crawling**: max depth of crawling (integer, default: unlimited)
13
- - **seed_size**: seed_size (string, default: `"myvalue"`)
14
- - **number_of_crawlers**: parallelism (string, default: 1)
15
- - **max_pages_to_fetch**: max_pages_to_fetch (string, default: unlimited)
13
+ - **number_of_crawlers**: parallelism (integer, default: 1)
14
+ - **max_pages_to_fetch**: max_pages_to_fetch (integer, default: unlimited)
16
15
  - **crawl_storage_folder**: crawl_storage_folder (string, required)
17
16
  - **politeness_delay**: politeness_delay (integer, default: null)
18
17
  - **user_agent_string**: user_agent_string (string, default: null)
19
- - **keep_input**: keep_input (string, default: `"myvalue"`)
18
+ - **output_prefix**: output_prefix (string, default: "")
19
+ - **connection_timeout**: connection timeout in milliseconds (integer, default: 30000)
20
+ - **socket_timeout**: socket timeout in milliseconds (integer, default: 20000)
20
21
 
21
22
  ## Example
22
23
 
@@ -33,7 +34,6 @@ filters:
33
34
  - type: crawler
34
35
  target_key: url
35
36
  number_of_crawlers: 10
36
- seed_size: 100
37
37
  max_depth_of_crawling: 4
38
38
  politeness_delay: 100
39
39
  crawl_storage_folder: "/tmp/crawl/%s"
data/build.gradle CHANGED
@@ -14,7 +14,7 @@ configurations {
14
14
  provided
15
15
  }
16
16
 
17
- version = "0.1.0"
17
+ version = "0.1.1"
18
18
 
19
19
  sourceCompatibility = 1.7
20
20
  targetCompatibility = 1.7
@@ -74,6 +74,14 @@ public class CrawlerFilterPlugin
74
74
  @Config("should_not_visit_pattern")
75
75
  @ConfigDefault("null")
76
76
  public Optional<String> getShouldNotVisitPattern();
77
+
78
+ @Config("connection_timeout")
79
+ @ConfigDefault("null")
80
+ public Optional<Integer> getConnectionTimeout();
81
+
82
+ @Config("socket_timeout")
83
+ @ConfigDefault("null")
84
+ public Optional<Integer> getSocketTimeout();
77
85
  }
78
86
 
79
87
  @Override
@@ -202,6 +210,12 @@ public class CrawlerFilterPlugin
202
210
  if (task.getUserAgentString().isPresent()) {
203
211
  config.setUserAgentString(task.getUserAgentString().get());
204
212
  }
213
+ if (task.getSocketTimeout().isPresent()) {
214
+ config.setSocketTimeout(task.getSocketTimeout().get());
215
+ }
216
+ if (task.getConnectionTimeout().isPresent()) {
217
+ config.setConnectionTimeout(task.getConnectionTimeout().get());
218
+ }
205
219
 
206
220
  PageFetcher pageFetcher = new PageFetcher(config);
207
221
  RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-filter-crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - toyama0919
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-03-25 00:00:00.000000000 Z
11
+ date: 2016-03-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -73,7 +73,7 @@ files:
73
73
  - classpath/commons-logging-1.2.jar
74
74
  - classpath/crawler4j-4.2.jar
75
75
  - classpath/dom4j-1.6.1.jar
76
- - classpath/embulk-filter-crawler-0.1.0.jar
76
+ - classpath/embulk-filter-crawler-0.1.1.jar
77
77
  - classpath/fontbox-1.8.4.jar
78
78
  - classpath/geronimo-stax-api_1.0_spec-1.0.1.jar
79
79
  - classpath/httpclient-4.4.jar