embulk-filter-crawler 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 30a79fc0749463b0dce27c96cb1e32f6cdced9f2
4
- data.tar.gz: 80db4bafe0d5b0418aa8cf18be52c8cfe66f2e95
3
+ metadata.gz: 52c2e67425310d86c6695fecbd4eeec2e80bc1f7
4
+ data.tar.gz: 12615ac3b93eb0eef4dd7c057f2508b070770611
5
5
  SHA512:
6
- metadata.gz: 7c7df4cb9d0e103a8dc17e34ee90be45da7c659f5a0a0fc06d28215854dc12b026de9e4eaf4b65354c3befcc191c0a60efc33bd3c0c39ba4ac77eb3a0d327760
7
- data.tar.gz: fc890159a20819a203eae0ae61eb4cce3a9b8bc11d2e8903a61bd5a59ba1ccf451dd8f58e5e918eeb0d1f0f07fcff2138491c343c1eb7a1b52e01e219816614a
6
+ metadata.gz: 691bdc10a98944c09892392eab8187d14209cc00724d235c4fdab2d5d66f202254133da858a23a11371142941eb4bb060dda46960a529d5fb6e1091e4ef2ba9f
7
+ data.tar.gz: 4c5d5da29cbda4ba32f2450aeea928f3779179634565450214494d9ca9a29de41f63f1df536e15cac6f4ce98c3c1e32242396c2d13df740b00628d8f8b30219a
data/build.gradle CHANGED
@@ -14,7 +14,7 @@ configurations {
14
14
  provided
15
15
  }
16
16
 
17
- version = "0.1.1"
17
+ version = "0.1.2"
18
18
 
19
19
  sourceCompatibility = 1.7
20
20
  targetCompatibility = 1.7
@@ -19,6 +19,7 @@ import org.embulk.spi.PageOutput;
19
19
  import org.embulk.spi.PageReader;
20
20
  import org.embulk.spi.Schema;
21
21
  import org.embulk.spi.type.Types;
22
+ import org.slf4j.Logger;
22
23
 
23
24
  import com.google.common.base.Optional;
24
25
  import com.google.common.collect.ImmutableList;
@@ -34,6 +35,8 @@ import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
34
35
  public class CrawlerFilterPlugin
35
36
  implements FilterPlugin
36
37
  {
38
+ private static final Logger logger = Exec.getLogger(CrawlerFilterPlugin.class);
39
+
37
40
  public interface PluginTask
38
41
  extends Task
39
42
  {
@@ -131,21 +134,10 @@ public class CrawlerFilterPlugin
131
134
  return new PageOutput() {
132
135
  private PageReader reader = new PageReader(inputSchema);
133
136
  private PageBuilder builder = new PageBuilder(Exec.getBufferAllocator(), outputSchema, output);
134
- private CrawlController controller = getController();
135
137
 
136
138
  @Override
137
139
  public void finish()
138
140
  {
139
- for (Object object : controller.getCrawlersLocalData()) {
140
- CrawlStat crawlStat = (CrawlStat) object;
141
- for (Map<String, Object> map : crawlStat.getPages()) {
142
- for (Column outputColumn : outputSchema.getColumns()) {
143
- final Object value = map.get(outputColumn.getName());
144
- setValue(value, outputColumn);
145
- }
146
- builder.addRecord();
147
- }
148
- }
149
141
  builder.finish();
150
142
  }
151
143
 
@@ -158,6 +150,7 @@ public class CrawlerFilterPlugin
158
150
  @Override
159
151
  public void add(Page page)
160
152
  {
153
+ CrawlController controller = getController();
161
154
  reader.setPage(page);
162
155
  while (reader.nextRecord()) {
163
156
  controller.addSeed(reader.getString(keyNameColumn));
@@ -169,6 +162,16 @@ public class CrawlerFilterPlugin
169
162
  }
170
163
  controller.setCustomData(customData);
171
164
  controller.start(EmbulkCrawler.class, task.getNumberOfCrawlers());
165
+ for (Object object : controller.getCrawlersLocalData()) {
166
+ CrawlStat crawlStat = (CrawlStat) object;
167
+ for (Map<String, Object> map : crawlStat.getPages()) {
168
+ for (Column outputColumn : outputSchema.getColumns()) {
169
+ final Object value = map.get(outputColumn.getName());
170
+ setValue(value, outputColumn);
171
+ }
172
+ builder.addRecord();
173
+ }
174
+ }
172
175
  }
173
176
 
174
177
  /**
@@ -91,7 +91,7 @@ public class EmbulkCrawler extends WebCrawler
91
91
  BinaryParseData binaryParseData = (BinaryParseData) parseData;
92
92
  map.put(outputPrefix + Constants.HTML, binaryParseData.getHtml());
93
93
  }
94
- logger.info("{}", webURL.getURL());
94
+ logger.debug("{}", webURL.getURL());
95
95
  myCrawlStat.pages.add(map);
96
96
  }
97
97
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-filter-crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - toyama0919
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-03-26 00:00:00.000000000 Z
11
+ date: 2016-04-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -73,7 +73,7 @@ files:
73
73
  - classpath/commons-logging-1.2.jar
74
74
  - classpath/crawler4j-4.2.jar
75
75
  - classpath/dom4j-1.6.1.jar
76
- - classpath/embulk-filter-crawler-0.1.1.jar
76
+ - classpath/embulk-filter-crawler-0.1.2.jar
77
77
  - classpath/fontbox-1.8.4.jar
78
78
  - classpath/geronimo-stax-api_1.0_spec-1.0.1.jar
79
79
  - classpath/httpclient-4.4.jar