embulk-filter-crawler 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 30a79fc0749463b0dce27c96cb1e32f6cdced9f2
4
- data.tar.gz: 80db4bafe0d5b0418aa8cf18be52c8cfe66f2e95
3
+ metadata.gz: 52c2e67425310d86c6695fecbd4eeec2e80bc1f7
4
+ data.tar.gz: 12615ac3b93eb0eef4dd7c057f2508b070770611
5
5
  SHA512:
6
- metadata.gz: 7c7df4cb9d0e103a8dc17e34ee90be45da7c659f5a0a0fc06d28215854dc12b026de9e4eaf4b65354c3befcc191c0a60efc33bd3c0c39ba4ac77eb3a0d327760
7
- data.tar.gz: fc890159a20819a203eae0ae61eb4cce3a9b8bc11d2e8903a61bd5a59ba1ccf451dd8f58e5e918eeb0d1f0f07fcff2138491c343c1eb7a1b52e01e219816614a
6
+ metadata.gz: 691bdc10a98944c09892392eab8187d14209cc00724d235c4fdab2d5d66f202254133da858a23a11371142941eb4bb060dda46960a529d5fb6e1091e4ef2ba9f
7
+ data.tar.gz: 4c5d5da29cbda4ba32f2450aeea928f3779179634565450214494d9ca9a29de41f63f1df536e15cac6f4ce98c3c1e32242396c2d13df740b00628d8f8b30219a
data/build.gradle CHANGED
@@ -14,7 +14,7 @@ configurations {
14
14
  provided
15
15
  }
16
16
 
17
- version = "0.1.1"
17
+ version = "0.1.2"
18
18
 
19
19
  sourceCompatibility = 1.7
20
20
  targetCompatibility = 1.7
@@ -19,6 +19,7 @@ import org.embulk.spi.PageOutput;
19
19
  import org.embulk.spi.PageReader;
20
20
  import org.embulk.spi.Schema;
21
21
  import org.embulk.spi.type.Types;
22
+ import org.slf4j.Logger;
22
23
 
23
24
  import com.google.common.base.Optional;
24
25
  import com.google.common.collect.ImmutableList;
@@ -34,6 +35,8 @@ import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
34
35
  public class CrawlerFilterPlugin
35
36
  implements FilterPlugin
36
37
  {
38
+ private static final Logger logger = Exec.getLogger(CrawlerFilterPlugin.class);
39
+
37
40
  public interface PluginTask
38
41
  extends Task
39
42
  {
@@ -131,21 +134,10 @@ public class CrawlerFilterPlugin
131
134
  return new PageOutput() {
132
135
  private PageReader reader = new PageReader(inputSchema);
133
136
  private PageBuilder builder = new PageBuilder(Exec.getBufferAllocator(), outputSchema, output);
134
- private CrawlController controller = getController();
135
137
 
136
138
  @Override
137
139
  public void finish()
138
140
  {
139
- for (Object object : controller.getCrawlersLocalData()) {
140
- CrawlStat crawlStat = (CrawlStat) object;
141
- for (Map<String, Object> map : crawlStat.getPages()) {
142
- for (Column outputColumn : outputSchema.getColumns()) {
143
- final Object value = map.get(outputColumn.getName());
144
- setValue(value, outputColumn);
145
- }
146
- builder.addRecord();
147
- }
148
- }
149
141
  builder.finish();
150
142
  }
151
143
 
@@ -158,6 +150,7 @@ public class CrawlerFilterPlugin
158
150
  @Override
159
151
  public void add(Page page)
160
152
  {
153
+ CrawlController controller = getController();
161
154
  reader.setPage(page);
162
155
  while (reader.nextRecord()) {
163
156
  controller.addSeed(reader.getString(keyNameColumn));
@@ -169,6 +162,16 @@ public class CrawlerFilterPlugin
169
162
  }
170
163
  controller.setCustomData(customData);
171
164
  controller.start(EmbulkCrawler.class, task.getNumberOfCrawlers());
165
+ for (Object object : controller.getCrawlersLocalData()) {
166
+ CrawlStat crawlStat = (CrawlStat) object;
167
+ for (Map<String, Object> map : crawlStat.getPages()) {
168
+ for (Column outputColumn : outputSchema.getColumns()) {
169
+ final Object value = map.get(outputColumn.getName());
170
+ setValue(value, outputColumn);
171
+ }
172
+ builder.addRecord();
173
+ }
174
+ }
172
175
  }
173
176
 
174
177
  /**
@@ -91,7 +91,7 @@ public class EmbulkCrawler extends WebCrawler
91
91
  BinaryParseData binaryParseData = (BinaryParseData) parseData;
92
92
  map.put(outputPrefix + Constants.HTML, binaryParseData.getHtml());
93
93
  }
94
- logger.info("{}", webURL.getURL());
94
+ logger.debug("{}", webURL.getURL());
95
95
  myCrawlStat.pages.add(map);
96
96
  }
97
97
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-filter-crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - toyama0919
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-03-26 00:00:00.000000000 Z
11
+ date: 2016-04-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -73,7 +73,7 @@ files:
73
73
  - classpath/commons-logging-1.2.jar
74
74
  - classpath/crawler4j-4.2.jar
75
75
  - classpath/dom4j-1.6.1.jar
76
- - classpath/embulk-filter-crawler-0.1.1.jar
76
+ - classpath/embulk-filter-crawler-0.1.2.jar
77
77
  - classpath/fontbox-1.8.4.jar
78
78
  - classpath/geronimo-stax-api_1.0_spec-1.0.1.jar
79
79
  - classpath/httpclient-4.4.jar